From ce6369ed44abd687b154e5ea7ab6ad73d22f35cc Mon Sep 17 00:00:00 2001 From: Daniel Paoliello Date: Mon, 30 Sep 2024 10:40:01 -0700 Subject: [PATCH 001/151] Fix build break when building RISCVInstrInfo.cpp with MSVC (#110342) After #109774 MSVC is failing to build LLVM with the error: ``` llvm\lib\Target\RISCV\RISCVInstrInfo.cpp(782): warning C4018: '<': signed/unsigned mismatch ``` Fix is ensure that the RHS is an unsigned integer. From 3e5e48a17321089e802fe41400a356e41dbe347b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Mon, 30 Sep 2024 10:41:59 -0700 Subject: [PATCH 002/151] [flang][cuda] Fix buildbot failure (#110540) https://lab.llvm.org/buildbot/#/builders/89/builds/7488 --- flang/lib/Optimizer/Transforms/CufOpConversion.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp index a1405d0e85c1d..a80ca7a125abd 100644 --- a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp @@ -370,18 +370,20 @@ struct CufDataTransferOpConversion mlir::Type srcTy = fir::unwrapRefType(op.getSrc().getType()); mlir::Type dstTy = fir::unwrapRefType(op.getDst().getType()); - unsigned mode; + mlir::Location loc = op.getLoc(); + unsigned mode = 0; if (op.getTransferKind() == cuf::DataTransferKind::HostDevice) { mode = kHostToDevice; } else if (op.getTransferKind() == cuf::DataTransferKind::DeviceHost) { mode = kDeviceToHost; } else if (op.getTransferKind() == cuf::DataTransferKind::DeviceDevice) { mode = kDeviceToDevice; + } else { + mlir::emitError(loc, "unsupported transfer kind\n"); } auto mod = op->getParentOfType(); fir::FirOpBuilder builder(rewriter, mod); - mlir::Location loc = op.getLoc(); fir::KindMapping kindMap{fir::getKindMapping(mod)}; 
mlir::Value modeValue = builder.createIntegerConstant(loc, builder.getI32Type(), mode); From 5d45815473496db4b041a008e60be17bd78c06ae Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Mon, 30 Sep 2024 13:51:41 -0400 Subject: [PATCH 003/151] [docs][amdgpu] Update kernarg documentation for gfx90a (#109690) Update the docs to mention that kernel argument preloading is not supported on MI210. --- llvm/docs/AMDGPUUsage.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 38300863f7889..0b8f2e4f96715 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -361,7 +361,7 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following - tgsplit flat - *rocm-amdhsa* - AMD Instinct MI250 Accelerator - xnack scratch - *rocm-amdhsa* - AMD Instinct MI250X Accelerator - kernarg preload - Packed - work-item + (except MI210) work-item IDs ``gfx90c`` ``amdgcn`` APU - xnack - Absolute - *pal-amdpal* - Ryzen 7 4700G From 85181788576151cc4b52d38d9b52d04f26179530 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Mon, 30 Sep 2024 20:07:28 +0200 Subject: [PATCH 004/151] [clang][bytecode] Implement ia32_bextr builitns (#110513) --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 36 ++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 82ed6d9e7a2ff..eb59cf3e9b1e3 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -14,6 +14,7 @@ #include "clang/AST/OSLog.h" #include "clang/AST/RecordLayout.h" #include "clang/Basic/Builtins.h" +#include "clang/Basic/TargetBuiltins.h" #include "clang/Basic/TargetInfo.h" #include "llvm/Support/SipHash.h" @@ -1152,6 +1153,33 @@ static bool interp__builtin_is_aligned_up_down(InterpState &S, CodePtr OpPC, return false; } +static bool interp__builtin_ia32_bextr(InterpState &S, CodePtr OpPC, + const 
InterpFrame *Frame, + const Function *Func, + const CallExpr *Call) { + PrimType ValT = *S.Ctx.classify(Call->getArg(0)); + PrimType IndexT = *S.Ctx.classify(Call->getArg(1)); + APSInt Val = peekToAPSInt(S.Stk, ValT, + align(primSize(ValT)) + align(primSize(IndexT))); + APSInt Index = peekToAPSInt(S.Stk, IndexT); + + unsigned BitWidth = Val.getBitWidth(); + uint64_t Shift = Index.extractBitsAsZExtValue(8, 0); + uint64_t Length = Index.extractBitsAsZExtValue(8, 8); + Length = Length > BitWidth ? BitWidth : Length; + + // Handle out of bounds cases. + if (Length == 0 || Shift >= BitWidth) { + pushInteger(S, 0, Call->getType()); + return true; + } + + uint64_t Result = Val.getZExtValue() >> Shift; + Result &= llvm::maskTrailingOnes(Length); + pushInteger(S, Result, Call->getType()); + return true; +} + static bool interp__builtin_os_log_format_buffer_size(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, @@ -1737,6 +1765,14 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F, return false; break; + case clang::X86::BI__builtin_ia32_bextr_u32: + case clang::X86::BI__builtin_ia32_bextr_u64: + case clang::X86::BI__builtin_ia32_bextri_u32: + case clang::X86::BI__builtin_ia32_bextri_u64: + if (!interp__builtin_ia32_bextr(S, OpPC, Frame, F, Call)) + return false; + break; + case Builtin::BI__builtin_os_log_format_buffer_size: if (!interp__builtin_os_log_format_buffer_size(S, OpPC, Frame, F, Call)) return false; From bbdca53bad670b69e299b1094e2d654a3a76e0dd Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Mon, 30 Sep 2024 13:09:06 -0500 Subject: [PATCH 005/151] [KnownBitsTest] Add optimality tests to some optimal impls; NFC Adding optimality test to `add`, `sub`, `avgCeilU` and `avgFloorU` --- llvm/unittests/Support/KnownBitsTest.cpp | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/llvm/unittests/Support/KnownBitsTest.cpp b/llvm/unittests/Support/KnownBitsTest.cpp index b6e16f809ea77..b701757aed5eb 100644 --- 
a/llvm/unittests/Support/KnownBitsTest.cpp +++ b/llvm/unittests/Support/KnownBitsTest.cpp @@ -305,15 +305,13 @@ TEST(KnownBitsTest, BinaryExhaustive) { [](const KnownBits &Known1, const KnownBits &Known2) { return KnownBits::add(Known1, Known2); }, - [](const APInt &N1, const APInt &N2) { return N1 + N2; }, - /*CheckOptimality=*/false); + [](const APInt &N1, const APInt &N2) { return N1 + N2; }); testBinaryOpExhaustive( "sub", [](const KnownBits &Known1, const KnownBits &Known2) { return KnownBits::sub(Known1, Known2); }, - [](const APInt &N1, const APInt &N2) { return N1 - N2; }, - /*CheckOptimality=*/false); + [](const APInt &N1, const APInt &N2) { return N1 - N2; }); testBinaryOpExhaustive("umax", KnownBits::umax, APIntOps::umax); testBinaryOpExhaustive("umin", KnownBits::umin, APIntOps::umin); testBinaryOpExhaustive("smax", KnownBits::smax, APIntOps::smax); @@ -524,16 +522,15 @@ TEST(KnownBitsTest, BinaryExhaustive) { /*CheckOptimality=*/false); testBinaryOpExhaustive("avgFloorS", KnownBits::avgFloorS, APIntOps::avgFloorS, - false); + /*CheckOptimality=*/false); - testBinaryOpExhaustive("avgFloorU", KnownBits::avgFloorU, APIntOps::avgFloorU, - false); + testBinaryOpExhaustive("avgFloorU", KnownBits::avgFloorU, + APIntOps::avgFloorU); - testBinaryOpExhaustive("avgCeilU", KnownBits::avgCeilU, APIntOps::avgCeilU, - false); + testBinaryOpExhaustive("avgCeilU", KnownBits::avgCeilU, APIntOps::avgCeilU); testBinaryOpExhaustive("avgCeilS", KnownBits::avgCeilS, APIntOps::avgCeilS, - false); + /*CheckOptimality=*/false); } TEST(KnownBitsTest, UnaryExhaustive) { From 607c525110ed174fa2963fc2b621109f1a95111b Mon Sep 17 00:00:00 2001 From: Maurice Heumann Date: Mon, 30 Sep 2024 20:14:38 +0200 Subject: [PATCH 006/151] [ARM64] [Windows] Mark block address as taken when expanding catchrets (#109252) This fixes issue #109250 The issue happens during the `MachineBlockPlacement` pass. 
The block, whose address was previously not taken, is deemed redundant by the pass and subsequently replaced using `MachineBasicBlock::ReplaceUsesOfBlockWith` in `BranchFolding`. ReplaceUsesOfBlockWith only replaces uses in the terminator. However, `expandPostRAPseudo` introduces new block uses when expanding catchrets. These uses do not get replaced, which results in undefined label errors later on. Marking the block addresss as taken prevents the replacement of the block, without also replacing non-terminator uses. --- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 1 + llvm/test/CodeGen/AArch64/pr58516.ll | 3 +- .../wineh-catchret-label-generation.ll | 97 +++++++++++++++++++ llvm/test/CodeGen/AArch64/wineh-try-catch.ll | 2 +- 4 files changed, 101 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/wineh-catchret-label-generation.ll diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 3b38a5f78dee5..32bc0e7d0d647 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -1994,6 +1994,7 @@ bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { .addReg(AArch64::X0) .addMBB(TargetMBB) .addImm(0); + TargetMBB->setMachineBlockAddressTaken(); return true; } diff --git a/llvm/test/CodeGen/AArch64/pr58516.ll b/llvm/test/CodeGen/AArch64/pr58516.ll index b4840f01ce116..3361ded48d4e2 100644 --- a/llvm/test/CodeGen/AArch64/pr58516.ll +++ b/llvm/test/CodeGen/AArch64/pr58516.ll @@ -40,7 +40,8 @@ define void @osfx(ptr %this) comdat personality ptr @__CxxFrameHandler3 { ; CHECK-NEXT: // %bb.1: // %invoke.cont12 ; CHECK-NEXT: str wzr, [x20] ; CHECK-NEXT: str wzr, [x21] -; CHECK-NEXT: .LBB0_2: // %try.cont +; CHECK-NEXT: .LBB0_2: // Block address taken +; CHECK-NEXT: // %try.cont ; CHECK-NEXT: $ehgcr_0_2: ; CHECK-NEXT: .seh_startepilogue ; CHECK-NEXT: sub sp, x29, #24 diff --git 
a/llvm/test/CodeGen/AArch64/wineh-catchret-label-generation.ll b/llvm/test/CodeGen/AArch64/wineh-catchret-label-generation.ll new file mode 100644 index 0000000000000..1f30865c98e19 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/wineh-catchret-label-generation.ll @@ -0,0 +1,97 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple aarch64-unknown-windows-msvc %s -o - | FileCheck %s + +declare i32 @__CxxFrameHandler3(...) + +declare void @llvm.seh.try.begin() #0 + +define fastcc ptr @test_function(i1 %0, ptr %_Fmtfl.i.i, i1 %1) personality ptr @__CxxFrameHandler3 { +; CHECK-LABEL: test_function: +; CHECK: .Lfunc_begin0: +; CHECK-NEXT: .seh_proc test_function +; CHECK-NEXT: .seh_handler __CxxFrameHandler3, @unwind, @except +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_fplr_x 32 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .seh_set_fp +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: mov x3, #-2 // =0xfffffffffffffffe +; CHECK-NEXT: stur x3, [x29, #16] +; CHECK-NEXT: tbz w0, #0, .LBB0_3 +; CHECK-NEXT: // %bb.1: // %invoke.cont.i124 +; CHECK-NEXT: mov x0, x1 +; CHECK-NEXT: b .LBB0_4 +; CHECK-NEXT: .LBB0_2: // Block address taken +; CHECK-NEXT: // %some-block +; CHECK-NEXT: $ehgcr_0_2: +; CHECK-NEXT: .LBB0_3: // %left-block526 +; CHECK-NEXT: mov x0, xzr +; CHECK-NEXT: .LBB0_4: // %common.ret1 +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_fplr_x 32 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: ret +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_handlerdata +; CHECK-NEXT: .word ($cppxdata$test_function)@IMGREL +; CHECK-NEXT: .text +; CHECK-NEXT: .seh_endproc +; CHECK-NEXT: .def "?catch$5@?0?test_function@4HA"; +; CHECK-NEXT: .scl 3; +; CHECK-NEXT: .type 32; +; CHECK-NEXT: .endef +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: "?catch$5@?0?test_function@4HA": 
+; CHECK-NEXT: .seh_proc "?catch$5@?0?test_function@4HA" +; CHECK-NEXT: .seh_handler __CxxFrameHandler3, @unwind, @except +; CHECK-NEXT: .LBB0_5: // %catch.i +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_fplr_x 16 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: adrp x0, .LBB0_2 +; CHECK-NEXT: add x0, x0, .LBB0_2 +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_fplr_x 16 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: ret +entry: + br i1 %0, label %right-block527, label %left-block526 + +common.ret1: + %common.ret1.op = phi ptr [ null, %left-block530 ], [ null, %some-block ], [ %_Fmtfl.i.i, %invoke.cont.i124 ], [ null, %left-block526 ] + ret ptr %common.ret1.op + +invoke.cont.i124: + %.not657 = icmp eq i32 1, 0 + br i1 %.not657, label %some-block, label %common.ret1 + +catch.dispatch.i: + %2 = catchswitch within none [label %catch.i] unwind to caller + +catch.i: + %3 = catchpad within %2 [ptr null, i32 0, ptr null] + catchret from %3 to label %some-block + +some-block: + br label %common.ret1 + +left-block526: + br i1 %1, label %common.ret1, label %left-block530 + +right-block527: + invoke void @llvm.seh.try.begin() + to label %invoke.cont.i124 unwind label %catch.dispatch.i + +left-block530: + %.not = icmp eq i32 0, 0 + br label %common.ret1 +} + +attributes #0 = { nounwind willreturn memory(write) } + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"eh-asynch", i32 1} diff --git a/llvm/test/CodeGen/AArch64/wineh-try-catch.ll b/llvm/test/CodeGen/AArch64/wineh-try-catch.ll index b27e5374b2576..c3b5a8968d7bb 100644 --- a/llvm/test/CodeGen/AArch64/wineh-try-catch.ll +++ b/llvm/test/CodeGen/AArch64/wineh-try-catch.ll @@ -41,7 +41,7 @@ ; CHECK-LABEL: .Ltmp0: ; CHECK: bl "?func2@@YAHXZ -; CHECK: [[CATCHRETDEST:.LBB0_[0-9]+]]: // %catchret.dest +; CHECK: [[CATCHRETDEST:.LBB0_[0-9]+]]: // Block address taken ; Check the catch funclet. 
; CHECK-LABEL: "?catch$4@?0??func@@YAHXZ@4HA": From 41145feb77ddcb90b6628e3d11eea69e1ecf71c2 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Mon, 30 Sep 2024 14:17:05 -0400 Subject: [PATCH 007/151] [libc++][modules] Rewrite the modulemap to have fewer top-level modules (#110501) This is a re-application of bc6bd3bc1e9 which was reverted in f11abac6524 because it broke the Clang pre-commit CI. Original commit message: This patch rewrites the modulemap to have fewer top-level modules. Previously, our modulemap had one top level module for each header in the library, including private headers. This had the well-known problem of making compilation times terrible, in addition to being somewhat against the design principles of Clang modules. This patch provides almost an order of magnitude compilation time improvement when building modularized code (certainly subject to variations). For example, including without a module cache went from 22.4 seconds to 1.6 seconds, a 14x improvement. To achieve this, one might be tempted to simply put all the headers in a single top-level module. Unfortunately, this doesn't work because libc++ provides C compatibility headers (e.g. stdlib.h) which create cycles when the C Standard Library headers are modularized too. This is especially tricky since base systems are usually not modularized: as far as I know, only Xcode 16 beta contains a modularized SDK that makes this issue visible. To understand it, imagine we have the following setup: // in libc++'s include/c++/v1/module.modulemap module std { header stddef.h header stdlib.h } // in the C library's include/module.modulemap module clib { header stddef.h header stdlib.h } Now, imagine that the C library's includes , perhaps as an implementation detail. When building the `std` module, libc++'s header does `#include_next ` to get the C library's , so libc++ depends on the `clib` module. However, remember that the C library's header includes as an implementation detail. 
Since the header search paths for libc++ are (and must be) before the search paths for the C library, the C library ends up including libc++'s , which means it depends on the `std` module. That's a cycle. To solve this issue, this patch creates one top-level module for each C compatibility header. The rest of the libc++ headers are located in a single top-level `std` module, with two main exceptions. First, the module containing configuration headers (e.g. <__config>) has its own top-level module too, because those headers are included by the C compatibility headers. Second, we create a top-level std_core module that contains several dependency-free utilities used (directly or indirectly) from the __math subdirectory. This is needed because __math pulls in a bunch of stuff, and __math is used from the C compatibility header . As a direct benefit of this change, we don't need to generate an artificial __std_clang_module header anymore to provide a monolithic `std` module, since our modulemap does it naturally by construction. A next step after this change would be to look into whether math.h really needs to include the contents of __math, and if so, whether libc++'s math.h truly needs to include the C library's math.h header. Removing either dependency would break this annoying cycle. Thanks to Eric Fiselier for pointing out this approach during a recent meeting. This wasn't viable before some recent refactoring, but wrapping everything (except the C headers) in a large module is by far the simplest and the most effective way of doing this. 
Fixes #86193 --- libcxx/include/CMakeLists.txt | 1 - libcxx/include/__format/formatter_integral.h | 1 + libcxx/include/__std_clang_module | 193 - libcxx/include/module.modulemap | 4237 +++++++++-------- .../test/libcxx/clang_modules_include.gen.py | 14 +- .../utility/utility.synop/includes.pass.cpp | 23 - libcxx/utils/CMakeLists.txt | 5 - .../utils/generate_std_clang_module_header.py | 63 - 8 files changed, 2188 insertions(+), 2349 deletions(-) delete mode 100644 libcxx/include/__std_clang_module delete mode 100644 libcxx/test/std/experimental/utilities/utility/utility.synop/includes.pass.cpp delete mode 100644 libcxx/utils/generate_std_clang_module_header.py diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 23f8f71b42439..9bd1b41b8bfac 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -687,7 +687,6 @@ set(files __ranges/views.h __ranges/zip_view.h __split_buffer - __std_clang_module __std_mbstate_t.h __stop_token/atomic_unique_lock.h __stop_token/intrusive_list_view.h diff --git a/libcxx/include/__format/formatter_integral.h b/libcxx/include/__format/formatter_integral.h index beed3ab8d93df..0c04cce855a08 100644 --- a/libcxx/include/__format/formatter_integral.h +++ b/libcxx/include/__format/formatter_integral.h @@ -27,6 +27,7 @@ #include <__type_traits/make_unsigned.h> #include <__utility/unreachable.h> #include +#include #include #include #include diff --git a/libcxx/include/__std_clang_module b/libcxx/include/__std_clang_module deleted file mode 100644 index a21ed26addfe8..0000000000000 --- a/libcxx/include/__std_clang_module +++ /dev/null @@ -1,193 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// WARNING, this entire header is generated by -// utils/generate_std_clang_module_header.py -// DO NOT MODIFY! - -// This header should not be directly included, it's exclusively to import all -// of the libc++ public clang modules for the `std` clang module to export. In -// other words, it's to facilitate `@import std;` in Objective-C++ and `import std` -// in Swift to expose all of the libc++ interfaces. This is generally not -// recommended, however there are some clients that need to import all of libc++ -// without knowing what "all" is. -#if !__building_module(std) -# error "Do not include this header directly, include individual headers instead" -#endif - -#include <__config> - -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif - -#include -#include -#include -#if !defined(_LIBCPP_HAS_NO_ATOMIC_HEADER) -# include -#endif -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -#endif -#include -#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -#endif -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -#endif -#include -#include -#include -#include -#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -#endif -#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -#endif -#include -#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -#endif -#if 
!defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -#endif -#include -#include -#include -#include -#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -#endif -#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -#endif -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -#endif -#include -#include -#include -#include -#include -#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -#endif -#include -#include -#include -#include -#include -#include -#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -#endif -#include -#if !defined(_LIBCPP_HAS_NO_ATOMIC_HEADER) -# include -#endif -#include -#include -#include -#include -#include -#include -#include -#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -#endif -#include -#include -#include -#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -#endif -#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -#endif -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index ef32a30160b5d..dee9b0b88b794 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -1,2124 +1,2239 @@ -// Main C++ standard library interfaces -module std_algorithm [system] { - header "algorithm" - export * -} -module std_any [system] { - header "any" - export * -} -module std_array [system] { - header "array" - export * -} -module std_atomic [system] { - header "atomic" - export * -} -module std_barrier [system] { - header "barrier" - export * -} -module std_bit [system] { - header "bit" - export * -} -module std_bitset [system] { - header "bitset" - export * -} -module std_charconv [system] { - header "charconv" - module chars_format { header "__charconv/chars_format.h" } - module from_chars_integral { header 
"__charconv/from_chars_integral.h" } - module from_chars_result { header "__charconv/from_chars_result.h" } - module tables { header "__charconv/tables.h" } - module to_chars { header "__charconv/to_chars.h" } - module to_chars_base_10 { header "__charconv/to_chars_base_10.h" } - module to_chars_floating_point { header "__charconv/to_chars_floating_point.h" } - module to_chars_integral { header "__charconv/to_chars_integral.h" } - module to_chars_result { header "__charconv/to_chars_result.h" } - module traits { header "__charconv/traits.h" } - export * -} -module std_chrono [system] { - header "chrono" - export * -} -module std_codecvt [system] { - header "codecvt" - export * -} -module std_compare [system] { - header "compare" - export * -} -module std_complex [system] { - header "complex" - export * -} -module std_concepts [system] { - header "concepts" - export * -} -module std_condition_variable [system] { - header "condition_variable" - module condition_variable { header "__condition_variable/condition_variable.h" } - export * -} -module std_coroutine [system] { - header "coroutine" - module coroutine_handle { header "__coroutine/coroutine_handle.h" } - module coroutine_traits { header "__coroutine/coroutine_traits.h" } - module noop_coroutine_handle { header "__coroutine/noop_coroutine_handle.h" } - module trivial_awaitables { header "__coroutine/trivial_awaitables.h" } - export * -} -module std_deque [system] { - header "deque" - export * -} -module std_exception [system] { - header "exception" - export * -} -module std_execution [system] { - header "execution" - export * -} -module std_expected [system] { - header "expected" - export * -} -module std_filesystem [system] { - header "filesystem" - module copy_options { header "__filesystem/copy_options.h" } - module directory_entry { header "__filesystem/directory_entry.h" } - module directory_iterator { header "__filesystem/directory_iterator.h" } - module directory_options { header 
"__filesystem/directory_options.h" } - module file_status { header "__filesystem/file_status.h" } - module file_time_type { header "__filesystem/file_time_type.h" } - module file_type { header "__filesystem/file_type.h" } - module filesystem_error { - header "__filesystem/filesystem_error.h" - export std_private_memory_shared_ptr - } - module operations { header "__filesystem/operations.h" } - module path { - header "__filesystem/path.h" - export std_string // returned by various methods - } - module path_iterator { header "__filesystem/path_iterator.h" } - module perm_options { header "__filesystem/perm_options.h" } - module perms { header "__filesystem/perms.h" } - module recursive_directory_iterator { header "__filesystem/recursive_directory_iterator.h" } - module space_info { header "__filesystem/space_info.h" } - module u8path { header "__filesystem/u8path.h" } - export * -} -module std_format [system] { - header "format" - export * -} -module std_forward_list [system] { - header "forward_list" - export * -} -module std_fstream [system] { - header "fstream" - export * -} -module std_functional [system] { - header "functional" - export * -} -module std_future [system] { - header "future" - export * -} -module std_initializer_list [system] { - header "initializer_list" - export * -} -module std_iomanip [system] { - header "iomanip" - export * -} -module std_ios [system] { - header "ios" - export * -} -module std_iosfwd [system] { - header "iosfwd" - export * -} -module std_iostream [system] { - header "iostream" - export * -} -module std_istream [system] { - header "istream" - export * -} -module std_iterator [system] { - header "iterator" - export * -} -module std_latch [system] { - header "latch" - export * -} -module std_limits [system] { - header "limits" - export * -} -module std_list [system] { - header "list" - export * -} -module std_locale [system] { - header "locale" - export * -} -module std_map [system] { - header "map" - export * +// This module 
contains headers related to the configuration of the library. These headers +// are free of any dependency on the rest of libc++. +module std_config [system] { + textual header "__config" + textual header "__configuration/abi.h" + textual header "__configuration/availability.h" + textual header "__configuration/compiler.h" + textual header "__configuration/language.h" + textual header "__configuration/platform.h" + textual header "version" } -module std_mdspan [system] { - header "mdspan" - module default_accessor { header "__mdspan/default_accessor.h" } - module extents { header "__mdspan/extents.h" } - module fwd { header "__fwd/mdspan.h" } - module layout_left { header "__mdspan/layout_left.h" } - module layout_right { header "__mdspan/layout_right.h" } - module layout_stride { header "__mdspan/layout_stride.h" } - module mdspan { - header "__mdspan/mdspan.h" - export std_array // for strides() + +module std_core [system] { + module cstddef { + module byte { header "__cstddef/byte.h" } + module max_align_t { header "__cstddef/max_align_t.h" } + module nullptr_t { header "__cstddef/nullptr_t.h" } + module ptrdiff_t { header "__cstddef/ptrdiff_t.h" } + module size_t { header "__cstddef/size_t.h" } } - export * -} -module std_memory [system] { - header "memory" - export * -} -module std_memory_resource [system] { - header "memory_resource" - export * -} -module std_mutex [system] { - header "mutex" - export * -} -module std_new [system] { - header "new" - export * -} -module std_numbers [system] { - header "numbers" - export * -} -module std_numeric [system] { - header "numeric" - export * -} -module std_optional [system] { - header "optional" - export * -} -module std_ostream [system] { - header "ostream" - export * -} -module std_print [system] { - header "print" - export * -} -module std_queue [system] { - header "queue" - export * -} -module std_random [system] { - header "random" - export * -} -module std_ranges [system] { - header "ranges" - export * -} 
-module std_ratio [system] { - header "ratio" - export * -} -module std_regex [system] { - header "regex" - export * -} -module std_scoped_allocator [system] { - header "scoped_allocator" - export * -} -module std_semaphore [system] { - header "semaphore" - export * -} -module std_set [system] { - header "set" - export * -} -module std_shared_mutex [system] { - header "shared_mutex" - export std_version -} -module std_source_location [system] { - header "source_location" - export * -} -module std_span [system] { - header "span" - export std_private_ranges_enable_borrowed_range - export std_version - export std_private_span_span_fwd -} -module std_sstream [system] { - header "sstream" - export * -} -module std_stack [system] { - header "stack" - export * -} -module std_stdexcept [system] { - header "stdexcept" - export * -} -module std_stop_token [system] { - header "stop_token" - private header "__stop_token/atomic_unique_lock.h" - private header "__stop_token/intrusive_list_view.h" - private header "__stop_token/intrusive_shared_ptr.h" - private header "__stop_token/stop_callback.h" - private header "__stop_token/stop_source.h" - private header "__stop_token/stop_state.h" - private header "__stop_token/stop_token.h" - export * -} -module std_streambuf [system] { - header "streambuf" - export * -} -module std_string [system] { - header "string" - export * -} -module std_string_view [system] { - header "string_view" - export * -} -module std_strstream [system] { - header "strstream" - export * -} -module std_syncstream [system] { - header "syncstream" - export * -} -module std_system_error [system] { - header "system_error" - export * -} -module std_thread [system] { - header "thread" - export * -} -module std_tuple [system] { - header "tuple" - export * -} -module std_type_traits [system] { - header "type_traits" - export * -} -module std_typeindex [system] { - header "typeindex" - export * -} -module std_typeinfo [system] { - header "typeinfo" - export * -} 
-module std_unordered_map [system] { - header "unordered_map" - export * -} -module std_unordered_set [system] { - header "unordered_set" - export * -} -module std_utility [system] { - header "utility" - export * -} -module std_valarray [system] { - header "valarray" - export * -} -module std_variant [system] { - header "variant" - export * -} -module std_vector [system] { - header "vector" - export * -} -module std_version [system] { - header "version" - export * -} -// C standard library interface wrappers -module std_cassert [system] { - // 's use of NDEBUG requires textual inclusion. - textual header "cassert" -} -module std_ccomplex [system] { - header "ccomplex" - export * -} -module std_cctype [system] { - header "cctype" - export * -} -module std_cerrno [system] { - header "cerrno" - export * -} -module std_cfenv [system] { - header "cfenv" - export * -} -module std_cfloat [system] { - header "cfloat" - export * -} -module std_cinttypes [system] { - header "cinttypes" - export * -} -module std_ciso646 [system] { - header "ciso646" - export * -} -module std_climits [system] { - header "climits" - export * -} -module std_clocale [system] { - header "clocale" - export * -} -module std_cmath [system] { - header "cmath" - export * -} -module std_csetjmp [system] { - header "csetjmp" - export * -} -module std_csignal [system] { - header "csignal" - export * -} -// FIXME: is missing. 
-module std_cstdarg [system] { - header "cstdarg" - export * -} -module std_cstdbool [system] { - header "cstdbool" - export * -} -module std_cstddef [system] { - header "cstddef" - module byte { header "__cstddef/byte.h" } - module max_align_t { header "__cstddef/max_align_t.h" } - module nullptr_t { header "__cstddef/nullptr_t.h" } - module ptrdiff_t { header "__cstddef/ptrdiff_t.h" } - module size_t { header "__cstddef/size_t.h" } - export * -} -module std_cstdint [system] { - header "cstdint" - export * -} -module std_cstdio [system] { - header "cstdio" - export * -} -module std_cstdlib [system] { - header "cstdlib" - export * -} -module std_cstring [system] { - header "cstring" - export * -} -module std_ctgmath [system] { - header "ctgmath" - export * -} -module std_ctime [system] { - header "ctime" - export * -} -module std_cuchar [system] { - header "cuchar" - export * -} -module std_cwchar [system] { - header "cwchar" - export * -} -module std_cwctype [system] { - header "cwctype" - export * -} + module cstdint { + header "cstdint" + export * + } -// C standard library interfaces augmented/replaced in C++ -// provided by C library. -module std_complex_h [system] { - header "complex.h" - export * -} -module std_ctype_h [system] { - header "ctype.h" - export * -} -module std_errno_h [system] { - header "errno.h" - export * -} -module std_fenv_h [system] { - header "fenv.h" - export * -} -module std_float_h [system] { - header "float.h" - export * -} -module std_inttypes_h [system] { - header "inttypes.h" - export * -} -// provided by compiler. -module std_locale_h [system] { - header "locale.h" - export * -} -module std_math_h [system] { - header "math.h" - export * -} -// provided by C library. -// provided by C library. -// FIXME: is missing. -// provided by compiler. -module std_stdatomic_h [system] { - header "stdatomic.h" - export * -} -module std_stdbool_h [system] { - // 's __bool_true_false_are_defined macro requires textual inclusion. 
- textual header "stdbool.h" - export * -} -module std_stddef_h [system] { - // 's __need_* macros require textual inclusion. - textual header "stddef.h" - export * -} -module std_stdint_h [system] { - header "stdint.h" - export * -} -module std_stdio_h [system] { - // 's __need_* macros require textual inclusion. - textual header "stdio.h" - export * -} -module std_stdlib_h [system] { - // 's __need_* macros require textual inclusion. - textual header "stdlib.h" - export * -} -module std_string_h [system] { - header "string.h" - export * -} -module std_tgmath_h [system] { - header "tgmath.h" - export * -} -module std_uchar_h [system] { - header "uchar.h" - export * -} -// provided by C library. -module std_wchar_h [system] { - // 's __need_* macros require textual inclusion. - textual header "wchar.h" - export * -} -module std_wctype_h [system] { - header "wctype.h" - export * -} + module fwd { + module byte { header "__fwd/byte.h" } + module functional { header "__fwd/functional.h" } + module pair { header "__fwd/pair.h" } + module tuple { header "__fwd/tuple.h" } + } + + module limits { + header "limits" + export * + } + + module math { + module abs { header "__math/abs.h" } + module copysign { header "__math/copysign.h" } + module error_functions { header "__math/error_functions.h" } + module exponential_functions { header "__math/exponential_functions.h" } + module fdim { header "__math/fdim.h" } + module fma { header "__math/fma.h" } + module gamma { header "__math/gamma.h" } + module hyperbolic_functions { header "__math/hyperbolic_functions.h" } + module hypot { header "__math/hypot.h" } + module inverse_hyperbolic_functions { header "__math/inverse_hyperbolic_functions.h" } + module inverse_trigonometric_functions { header "__math/inverse_trigonometric_functions.h" } + module logarithms { header "__math/logarithms.h" } + module min_max { header "__math/min_max.h" } + module modulo { header "__math/modulo.h" } + module remainder { header 
"__math/remainder.h" } + module roots { header "__math/roots.h" } + module rounding_functions { header "__math/rounding_functions.h" } + module special_functions { header "__math/special_functions.h" } + module traits { header "__math/traits.h" } + module trigonometric_functions { header "__math/trigonometric_functions.h" } + } + + module type_traits { + module add_const { header "__type_traits/add_const.h" } + module add_cv { header "__type_traits/add_cv.h" } + module add_lvalue_reference { header "__type_traits/add_lvalue_reference.h" } + module add_pointer { header "__type_traits/add_pointer.h" } + module add_rvalue_reference { header "__type_traits/add_rvalue_reference.h" } + module add_volatile { header "__type_traits/add_volatile.h" } + module aligned_storage { header "__type_traits/aligned_storage.h" } + module aligned_union { header "__type_traits/aligned_union.h" } + module alignment_of { header "__type_traits/alignment_of.h" } + module can_extract_key { header "__type_traits/can_extract_key.h" } + module common_reference { header "__type_traits/common_reference.h" } + module common_type { + header "__type_traits/common_type.h" + // We need to export everything from this module because common_type inherits from __builtin_common_type, + // which needs to be re-exported. 
+ export * + } + module conditional { header "__type_traits/conditional.h" } + module conjunction { header "__type_traits/conjunction.h" } + module copy_cv { header "__type_traits/copy_cv.h" } + module copy_cvref { header "__type_traits/copy_cvref.h" } + module datasizeof { header "__type_traits/datasizeof.h" } + module decay { header "__type_traits/decay.h" } + module dependent_type { header "__type_traits/dependent_type.h" } + module desugars_to { header "__type_traits/desugars_to.h" } + module disjunction { header "__type_traits/disjunction.h" } + module enable_if { header "__type_traits/enable_if.h" } + module extent { header "__type_traits/extent.h" } + module has_unique_object_representation { header "__type_traits/has_unique_object_representation.h" } + module has_virtual_destructor { header "__type_traits/has_virtual_destructor.h" } + module integral_constant { header "__type_traits/integral_constant.h" } + module invoke { header "__type_traits/invoke.h" } + module is_abstract { + header "__type_traits/is_abstract.h" + export std_core.type_traits.integral_constant + } + module is_aggregate { + header "__type_traits/is_aggregate.h" + export std_core.type_traits.integral_constant + } + module is_allocator { + header "__type_traits/is_allocator.h" + export std_core.type_traits.integral_constant + } + module is_always_bitcastable { + header "__type_traits/is_always_bitcastable.h" + export std_core.type_traits.integral_constant + } + module is_arithmetic { + header "__type_traits/is_arithmetic.h" + export std_core.type_traits.integral_constant + } + module is_array { + header "__type_traits/is_array.h" + export std_core.type_traits.integral_constant + } + module is_assignable { + header "__type_traits/is_assignable.h" + export std_core.type_traits.integral_constant + } + module is_base_of { + header "__type_traits/is_base_of.h" + export std_core.type_traits.integral_constant + } + module is_bounded_array { + header "__type_traits/is_bounded_array.h" + export 
std_core.type_traits.integral_constant + } + module is_callable { + header "__type_traits/is_callable.h" + export std_core.type_traits.integral_constant + } + module is_char_like_type { + header "__type_traits/is_char_like_type.h" + export std_core.type_traits.integral_constant + } + module is_class { + header "__type_traits/is_class.h" + export std_core.type_traits.integral_constant + } + module is_compound { + header "__type_traits/is_compound.h" + export std_core.type_traits.integral_constant + } + module is_const { + header "__type_traits/is_const.h" + export std_core.type_traits.integral_constant + } + module is_constant_evaluated { + header "__type_traits/is_constant_evaluated.h" + export std_core.type_traits.integral_constant + } + module is_constructible { + header "__type_traits/is_constructible.h" + export std_core.type_traits.integral_constant + } + module is_convertible { + header "__type_traits/is_convertible.h" + export std_core.type_traits.integral_constant + } + module is_core_convertible { + header "__type_traits/is_core_convertible.h" + export std_core.type_traits.integral_constant + } + module is_destructible { + header "__type_traits/is_destructible.h" + export std_core.type_traits.integral_constant + } + module is_empty { + header "__type_traits/is_empty.h" + export std_core.type_traits.integral_constant + } + module is_enum { + header "__type_traits/is_enum.h" + export std_core.type_traits.integral_constant + } + module is_equality_comparable { + header "__type_traits/is_equality_comparable.h" + export std_core.type_traits.integral_constant + } + module is_execution_policy { + header "__type_traits/is_execution_policy.h" + export std_core.type_traits.integral_constant + } + module is_final { + header "__type_traits/is_final.h" + export std_core.type_traits.integral_constant + } + module is_floating_point { + header "__type_traits/is_floating_point.h" + export std_core.type_traits.integral_constant + } + module is_function { + header 
"__type_traits/is_function.h" + export std_core.type_traits.integral_constant + } + module is_fundamental { + header "__type_traits/is_fundamental.h" + export std_core.type_traits.integral_constant + } + module is_implicitly_default_constructible { + header "__type_traits/is_implicitly_default_constructible.h" + export std_core.type_traits.integral_constant + } + module is_integral { + header "__type_traits/is_integral.h" + export std_core.type_traits.integral_constant + } + module is_literal_type { + header "__type_traits/is_literal_type.h" + export std_core.type_traits.integral_constant + } + module is_member_pointer { + header "__type_traits/is_member_pointer.h" + export std_core.type_traits.integral_constant + } + module is_nothrow_assignable { + header "__type_traits/is_nothrow_assignable.h" + export std_core.type_traits.integral_constant + } + module is_nothrow_constructible { + header "__type_traits/is_nothrow_constructible.h" + export std_core.type_traits.integral_constant + } + module is_nothrow_convertible { + header "__type_traits/is_nothrow_convertible.h" + export std_core.type_traits.integral_constant + } + module is_nothrow_destructible { + header "__type_traits/is_nothrow_destructible.h" + export std_core.type_traits.integral_constant + } + module is_null_pointer { + header "__type_traits/is_null_pointer.h" + export std_core.type_traits.integral_constant + } + module is_object { + header "__type_traits/is_object.h" + export std_core.type_traits.integral_constant + } + module is_pod { + header "__type_traits/is_pod.h" + export std_core.type_traits.integral_constant + } + module is_pointer { + header "__type_traits/is_pointer.h" + export std_core.type_traits.integral_constant + } + module is_polymorphic { + header "__type_traits/is_polymorphic.h" + export std_core.type_traits.integral_constant + } + module is_primary_template { + header "__type_traits/is_primary_template.h" + export std_core.type_traits.integral_constant + } + module 
is_reference_wrapper { + header "__type_traits/is_reference_wrapper.h" + export std_core.type_traits.integral_constant + } + module is_reference { + header "__type_traits/is_reference.h" + export std_core.type_traits.integral_constant + } + module is_referenceable { + header "__type_traits/is_referenceable.h" + export std_core.type_traits.integral_constant + } + module is_same { + header "__type_traits/is_same.h" + export std_core.type_traits.integral_constant + } + module is_scalar { + header "__type_traits/is_scalar.h" + export std_core.type_traits.integral_constant + } + module is_signed_integer { + header "__type_traits/is_signed_integer.h" + export std_core.type_traits.integral_constant + } + module is_signed { + header "__type_traits/is_signed.h" + export std_core.type_traits.integral_constant + } + module is_specialization { + header "__type_traits/is_specialization.h" + export std_core.type_traits.integral_constant + } + module is_standard_layout { + header "__type_traits/is_standard_layout.h" + export std_core.type_traits.integral_constant + } + module is_swappable { + header "__type_traits/is_swappable.h" + export std_core.type_traits.integral_constant + } + module is_trivial { + header "__type_traits/is_trivial.h" + export std_core.type_traits.integral_constant + } + module is_trivially_assignable { + header "__type_traits/is_trivially_assignable.h" + export std_core.type_traits.integral_constant + } + module is_trivially_constructible { + header "__type_traits/is_trivially_constructible.h" + export std_core.type_traits.integral_constant + } + module is_trivially_copyable { + header "__type_traits/is_trivially_copyable.h" + export std_core.type_traits.integral_constant + } + module is_trivially_destructible { + header "__type_traits/is_trivially_destructible.h" + export std_core.type_traits.integral_constant + } + module is_trivially_lexicographically_comparable { + header "__type_traits/is_trivially_lexicographically_comparable.h" + export 
std_core.type_traits.integral_constant + } + module is_trivially_relocatable { + header "__type_traits/is_trivially_relocatable.h" + export std_core.type_traits.integral_constant + } + module is_unbounded_array { + header "__type_traits/is_unbounded_array.h" + export std_core.type_traits.integral_constant + } + module is_union { + header "__type_traits/is_union.h" + export std_core.type_traits.integral_constant + } + module is_unsigned_integer { + header "__type_traits/is_unsigned_integer.h" + export std_core.type_traits.integral_constant + } + module is_unsigned { + header "__type_traits/is_unsigned.h" + export std_core.type_traits.integral_constant + } + module is_valid_expansion { + header "__type_traits/is_valid_expansion.h" + export std_core.type_traits.integral_constant + } + module is_void { + header "__type_traits/is_void.h" + export std_core.type_traits.integral_constant + } + module is_volatile { + header "__type_traits/is_volatile.h" + export std_core.type_traits.integral_constant + } + module lazy { header "__type_traits/lazy.h" } + module make_32_64_or_128_bit { header "__type_traits/make_32_64_or_128_bit.h" } + module make_const_lvalue_ref { header "__type_traits/make_const_lvalue_ref.h" } + module make_signed { header "__type_traits/make_signed.h" } + module make_unsigned { header "__type_traits/make_unsigned.h" } + module maybe_const { header "__type_traits/maybe_const.h" } + module nat { header "__type_traits/nat.h" } + module negation { header "__type_traits/negation.h" } + module promote { header "__type_traits/promote.h" } + module rank { header "__type_traits/rank.h" } + module remove_all_extents { header "__type_traits/remove_all_extents.h" } + module remove_const_ref { header "__type_traits/remove_const_ref.h" } + module remove_const { header "__type_traits/remove_const.h" } + module remove_cv { header "__type_traits/remove_cv.h" } + module remove_cvref { header "__type_traits/remove_cvref.h" } + module remove_extent { header 
"__type_traits/remove_extent.h" } + module remove_pointer { header "__type_traits/remove_pointer.h" } + module remove_reference { header "__type_traits/remove_reference.h" } + module remove_volatile { header "__type_traits/remove_volatile.h" } + module result_of { header "__type_traits/result_of.h" } + module strip_signature { header "__type_traits/strip_signature.h" } + module type_identity { header "__type_traits/type_identity.h" } + module type_list { header "__type_traits/type_list.h" } + module underlying_type { header "__type_traits/underlying_type.h" } + module unwrap_ref { header "__type_traits/unwrap_ref.h" } + module void_t { header "__type_traits/void_t.h" } + + header "type_traits" + export * + } // module type_traits + + // Only the truly dependency-free parts of __utility are here + module utility_core { + module declval { header "__utility/declval.h" } + module empty { header "__utility/empty.h" } + module forward { header "__utility/forward.h" } + } +} // module std_core + +module std [system] { + module algorithm { + module adjacent_find { header "__algorithm/adjacent_find.h" } + module all_of { header "__algorithm/all_of.h" } + module any_of { header "__algorithm/any_of.h" } + module binary_search { header "__algorithm/binary_search.h" } + module clamp { header "__algorithm/clamp.h" } + module comp_ref_type { header "__algorithm/comp_ref_type.h" } + module comp { header "__algorithm/comp.h" } + module copy_backward { header "__algorithm/copy_backward.h" } + module copy_if { header "__algorithm/copy_if.h" } + module copy_move_common { header "__algorithm/copy_move_common.h" } + module copy_n { header "__algorithm/copy_n.h" } + module copy { header "__algorithm/copy.h" } + module count_if { header "__algorithm/count_if.h" } + module count { header "__algorithm/count.h" } + module equal_range { header "__algorithm/equal_range.h" } + module equal { header "__algorithm/equal.h" } + module fill_n { header "__algorithm/fill_n.h" } + module fill { header 
"__algorithm/fill.h" } + module find_end { header "__algorithm/find_end.h" } + module find_first_of { header "__algorithm/find_first_of.h" } + module find_if_not { header "__algorithm/find_if_not.h" } + module find_if { header "__algorithm/find_if.h" } + module find_segment_if { header "__algorithm/find_segment_if.h" } + module find { header "__algorithm/find.h" } + module for_each_n { header "__algorithm/for_each_n.h" } + module for_each_segment { header "__algorithm/for_each_segment.h" } + module for_each { header "__algorithm/for_each.h" } + module generate_n { header "__algorithm/generate_n.h" } + module generate { header "__algorithm/generate.h" } + module half_positive { header "__algorithm/half_positive.h" } + module in_found_result { header "__algorithm/in_found_result.h" } + module in_fun_result { header "__algorithm/in_fun_result.h" } + module in_in_out_result { header "__algorithm/in_in_out_result.h" } + module in_in_result { header "__algorithm/in_in_result.h" } + module in_out_out_result { header "__algorithm/in_out_out_result.h" } + module in_out_result { header "__algorithm/in_out_result.h" } + module includes { header "__algorithm/includes.h" } + module inplace_merge { header "__algorithm/inplace_merge.h" } + module is_heap_until { header "__algorithm/is_heap_until.h" } + module is_heap { header "__algorithm/is_heap.h" } + module is_partitioned { header "__algorithm/is_partitioned.h" } + module is_permutation { header "__algorithm/is_permutation.h" } + module is_sorted_until { header "__algorithm/is_sorted_until.h" } + module is_sorted { header "__algorithm/is_sorted.h" } + module iter_swap { header "__algorithm/iter_swap.h" } + module iterator_operations { + header "__algorithm/iterator_operations.h" + export std.iterator.advance + export std.iterator.distance + export std.iterator.iter_move + export std.iterator.iter_swap + export std.iterator.next + export std.iterator.prev + } + module lexicographical_compare_three_way { header 
"__algorithm/lexicographical_compare_three_way.h" } + module lexicographical_compare { header "__algorithm/lexicographical_compare.h" } + module lower_bound { header "__algorithm/lower_bound.h" } + module make_heap { header "__algorithm/make_heap.h" } + module make_projected { header "__algorithm/make_projected.h" } + module max_element { header "__algorithm/max_element.h" } + module max { header "__algorithm/max.h" } + module merge { header "__algorithm/merge.h" } + module min_element { header "__algorithm/min_element.h" } + module min_max_result { header "__algorithm/min_max_result.h" } + module min { header "__algorithm/min.h" } + module minmax_element { header "__algorithm/minmax_element.h" } + module minmax { + header "__algorithm/minmax.h" + export std.utility.pair // return type + } + module mismatch { + header "__algorithm/mismatch.h" + export std.utility.pair // return type + } + module move_backward { header "__algorithm/move_backward.h" } + module move { header "__algorithm/move.h" } + module next_permutation { header "__algorithm/next_permutation.h" } + module none_of { header "__algorithm/none_of.h" } + module nth_element { header "__algorithm/nth_element.h" } + module partial_sort_copy { header "__algorithm/partial_sort_copy.h" } + module partial_sort { header "__algorithm/partial_sort.h" } + module partition_copy { header "__algorithm/partition_copy.h" } + module partition_point { header "__algorithm/partition_point.h" } + module partition { header "__algorithm/partition.h" } + module pop_heap { header "__algorithm/pop_heap.h" } + module prev_permutation { header "__algorithm/prev_permutation.h" } + module pstl { header "__algorithm/pstl.h" } + module push_heap { header "__algorithm/push_heap.h" } + module ranges_adjacent_find { header "__algorithm/ranges_adjacent_find.h" } + module ranges_all_of { header "__algorithm/ranges_all_of.h" } + module ranges_any_of { header "__algorithm/ranges_any_of.h" } + module ranges_binary_search { + header 
"__algorithm/ranges_binary_search.h" + export std.functional.ranges_operations + } + module ranges_clamp { + header "__algorithm/ranges_clamp.h" + export std.functional.ranges_operations + } + module ranges_contains_subrange { + header "__algorithm/ranges_contains_subrange.h" + } + module ranges_contains { + header "__algorithm/ranges_contains.h" + } + module ranges_copy_backward { + header "__algorithm/ranges_copy_backward.h" + export std.algorithm.in_out_result + } + module ranges_copy_if { + header "__algorithm/ranges_copy_if.h" + export std.algorithm.in_out_result + } + module ranges_copy_n { + header "__algorithm/ranges_copy_n.h" + export std.algorithm.in_out_result + } + module ranges_copy { + header "__algorithm/ranges_copy.h" + export std.algorithm.in_out_result + } + module ranges_count_if { header "__algorithm/ranges_count_if.h" } + module ranges_count { header "__algorithm/ranges_count.h" } + module ranges_ends_with { header "__algorithm/ranges_ends_with.h" } + module ranges_equal_range { + header "__algorithm/ranges_equal_range.h" + export std.functional.ranges_operations + } + module ranges_equal { + header "__algorithm/ranges_equal.h" + export std.functional.identity + } + module ranges_fill_n { header "__algorithm/ranges_fill_n.h" } + module ranges_fill { header "__algorithm/ranges_fill.h" } + module ranges_find_end { header "__algorithm/ranges_find_end.h" } + module ranges_find_first_of { header "__algorithm/ranges_find_first_of.h" } + module ranges_find_if_not { header "__algorithm/ranges_find_if_not.h" } + module ranges_find_if { header "__algorithm/ranges_find_if.h" } + module ranges_find_last { header "__algorithm/ranges_find_last.h" } + module ranges_find { header "__algorithm/ranges_find.h" } + module ranges_fold { header "__algorithm/ranges_fold.h" } + module ranges_for_each_n { + header "__algorithm/ranges_for_each_n.h" + export std.algorithm.in_fun_result + } + module ranges_for_each { + header "__algorithm/ranges_for_each.h" + export 
std.algorithm.in_fun_result + } + module ranges_generate_n { + header "__algorithm/ranges_generate_n.h" + } + module ranges_generate { + header "__algorithm/ranges_generate.h" + } + module ranges_includes { + header "__algorithm/ranges_includes.h" + export std.functional.ranges_operations + } + module ranges_inplace_merge { + header "__algorithm/ranges_inplace_merge.h" + export std.functional.ranges_operations + } + module ranges_is_heap_until { + header "__algorithm/ranges_is_heap_until.h" + export std.functional.ranges_operations + } + module ranges_is_heap { + header "__algorithm/ranges_is_heap.h" + export std.functional.ranges_operations + } + module ranges_is_partitioned { + header "__algorithm/ranges_is_partitioned.h" + } + module ranges_is_permutation { + header "__algorithm/ranges_is_permutation.h" + } + module ranges_is_sorted_until { + header "__algorithm/ranges_is_sorted_until.h" + export std.functional.ranges_operations + } + module ranges_is_sorted { + header "__algorithm/ranges_is_sorted.h" + export std.functional.ranges_operations + } + module ranges_iterator_concept { + header "__algorithm/ranges_iterator_concept.h" + } + module ranges_lexicographical_compare { + header "__algorithm/ranges_lexicographical_compare.h" + export std.functional.ranges_operations + } + module ranges_lower_bound { + header "__algorithm/ranges_lower_bound.h" + export std.functional.ranges_operations + } + module ranges_make_heap { + header "__algorithm/ranges_make_heap.h" + export std.functional.ranges_operations + } + module ranges_max_element { + header "__algorithm/ranges_max_element.h" + export std.functional.ranges_operations + } + module ranges_max { + header "__algorithm/ranges_max.h" + export std.functional.ranges_operations + } + module ranges_merge { + header "__algorithm/ranges_merge.h" + export std.functional.ranges_operations + export std.algorithm.in_in_out_result + } + module ranges_min_element { + header "__algorithm/ranges_min_element.h" + export 
std.functional.ranges_operations + } + module ranges_min { + header "__algorithm/ranges_min.h" + export std.functional.ranges_operations + } + module ranges_minmax_element { + header "__algorithm/ranges_minmax_element.h" + export std.functional.ranges_operations + export std.algorithm.min_max_result + } + module ranges_minmax { + header "__algorithm/ranges_minmax.h" + export std.functional.ranges_operations + export std.algorithm.min_max_result + } + module ranges_mismatch { + header "__algorithm/ranges_mismatch.h" + export std.algorithm.in_in_result + } + module ranges_move_backward { + header "__algorithm/ranges_move_backward.h" + export std.algorithm.in_out_result + } + module ranges_move { + header "__algorithm/ranges_move.h" + export std.algorithm.in_out_result + } + module ranges_next_permutation { + header "__algorithm/ranges_next_permutation.h" + export std.functional.ranges_operations + export std.algorithm.in_found_result + } + module ranges_none_of { + header "__algorithm/ranges_none_of.h" + } + module ranges_nth_element { + header "__algorithm/ranges_nth_element.h" + export std.functional.ranges_operations + } + module ranges_partial_sort_copy { + header "__algorithm/ranges_partial_sort_copy.h" + export std.functional.ranges_operations + } + module ranges_partial_sort { + header "__algorithm/ranges_partial_sort.h" + export std.functional.ranges_operations + } + module ranges_partition_copy { + header "__algorithm/ranges_partition_copy.h" + export std.algorithm.in_out_out_result + } + module ranges_partition_point { + header "__algorithm/ranges_partition_point.h" + } + module ranges_partition { + header "__algorithm/ranges_partition.h" + } + module ranges_pop_heap { + header "__algorithm/ranges_pop_heap.h" + export std.functional.ranges_operations + } + module ranges_prev_permutation { + header "__algorithm/ranges_prev_permutation.h" + export std.functional.ranges_operations + export std.algorithm.in_found_result + } + module ranges_push_heap { + header 
"__algorithm/ranges_push_heap.h" + export std.functional.ranges_operations + } + module ranges_remove_copy_if { + header "__algorithm/ranges_remove_copy_if.h" + export std.algorithm.in_out_result + } + module ranges_remove_copy { + header "__algorithm/ranges_remove_copy.h" + export std.algorithm.in_out_result + } + module ranges_remove_if { + header "__algorithm/ranges_remove_if.h" + } + module ranges_remove { + header "__algorithm/ranges_remove.h" + } + module ranges_replace_copy_if { + header "__algorithm/ranges_replace_copy_if.h" + export std.algorithm.in_out_result + } + module ranges_replace_copy { + header "__algorithm/ranges_replace_copy.h" + export std.algorithm.in_out_result + } + module ranges_replace_if { + header "__algorithm/ranges_replace_if.h" + } + module ranges_replace { + header "__algorithm/ranges_replace.h" + } + module ranges_reverse_copy { + header "__algorithm/ranges_reverse_copy.h" + export std.algorithm.in_out_result + } + module ranges_reverse { + header "__algorithm/ranges_reverse.h" + } + module ranges_rotate_copy { + header "__algorithm/ranges_rotate_copy.h" + export std.algorithm.in_out_result + } + module ranges_rotate { header "__algorithm/ranges_rotate.h" } + module ranges_sample { header "__algorithm/ranges_sample.h" } + module ranges_search_n { header "__algorithm/ranges_search_n.h" } + module ranges_search { header "__algorithm/ranges_search.h" } + module ranges_set_difference { + header "__algorithm/ranges_set_difference.h" + export std.functional.ranges_operations + export std.algorithm.in_out_result + } + module ranges_set_intersection { + header "__algorithm/ranges_set_intersection.h" + export std.functional.ranges_operations + export std.algorithm.in_in_out_result + } + module ranges_set_symmetric_difference { + header "__algorithm/ranges_set_symmetric_difference.h" + export std.functional.ranges_operations + export std.algorithm.in_in_out_result + } + module ranges_set_union { + header "__algorithm/ranges_set_union.h" + 
export std.functional.ranges_operations + export std.algorithm.in_in_out_result + } + module ranges_shuffle { + header "__algorithm/ranges_shuffle.h" + } + module ranges_sort_heap { + header "__algorithm/ranges_sort_heap.h" + export std.functional.ranges_operations + } + module ranges_sort { + header "__algorithm/ranges_sort.h" + export std.functional.ranges_operations + } + module ranges_stable_partition { + header "__algorithm/ranges_stable_partition.h" + } + module ranges_stable_sort { + header "__algorithm/ranges_stable_sort.h" + export std.functional.ranges_operations + } + module ranges_starts_with { + header "__algorithm/ranges_starts_with.h" + } + module ranges_swap_ranges { + header "__algorithm/ranges_swap_ranges.h" + export std.algorithm.in_in_result + } + module ranges_transform { + header "__algorithm/ranges_transform.h" + export std.algorithm.in_out_result + export std.algorithm.in_in_out_result + } + module ranges_unique_copy { + header "__algorithm/ranges_unique_copy.h" + } + module ranges_unique { + header "__algorithm/ranges_unique.h" + } + module ranges_upper_bound { + header "__algorithm/ranges_upper_bound.h" + export std.functional.ranges_operations + } + module remove_copy_if { header "__algorithm/remove_copy_if.h" } + module remove_copy { header "__algorithm/remove_copy.h" } + module remove_if { header "__algorithm/remove_if.h" } + module remove { header "__algorithm/remove.h" } + module replace_copy_if { header "__algorithm/replace_copy_if.h" } + module replace_copy { header "__algorithm/replace_copy.h" } + module replace_if { header "__algorithm/replace_if.h" } + module replace { header "__algorithm/replace.h" } + module reverse_copy { header "__algorithm/reverse_copy.h" } + module reverse { header "__algorithm/reverse.h" } + module rotate_copy { header "__algorithm/rotate_copy.h" } + module rotate { header "__algorithm/rotate.h" } + module sample { header "__algorithm/sample.h" } + module search_n { header "__algorithm/search_n.h" } + 
module search { header "__algorithm/search.h" } + module set_difference { header "__algorithm/set_difference.h" } + module set_intersection { header "__algorithm/set_intersection.h" } + module set_symmetric_difference { header "__algorithm/set_symmetric_difference.h" } + module set_union { header "__algorithm/set_union.h" } + module shift_left { header "__algorithm/shift_left.h" } + module shift_right { header "__algorithm/shift_right.h" } + module shuffle { header "__algorithm/shuffle.h" } + module sift_down { header "__algorithm/sift_down.h" } + module simd_utils { header "__algorithm/simd_utils.h" } + module sort_heap { header "__algorithm/sort_heap.h" } + module sort { header "__algorithm/sort.h" } + module stable_partition { header "__algorithm/stable_partition.h" } + module stable_sort { header "__algorithm/stable_sort.h" } + module swap_ranges { header "__algorithm/swap_ranges.h" } + module three_way_comp_ref_type { header "__algorithm/three_way_comp_ref_type.h" } + module transform { header "__algorithm/transform.h" } + module uniform_random_bit_generator_adaptor { header "__algorithm/uniform_random_bit_generator_adaptor.h" } + module unique_copy { header "__algorithm/unique_copy.h" } + module unique { header "__algorithm/unique.h" } + module unwrap_iter { header "__algorithm/unwrap_iter.h" } + module unwrap_range { header "__algorithm/unwrap_range.h" } + module upper_bound { header "__algorithm/upper_bound.h" } + + header "algorithm" + export * + } // module algorithm + + module any { + header "any" + export * + } + + module array { + module fwd { header "__fwd/array.h" } + + header "array" + export * + } + + module atomic { + module aliases { header "__atomic/aliases.h" } + module atomic_base { header "__atomic/atomic_base.h" } + module atomic_flag { header "__atomic/atomic_flag.h" } + module atomic_init { header "__atomic/atomic_init.h" } + module atomic_lock_free { header "__atomic/atomic_lock_free.h" } + module atomic_ref { header 
"__atomic/atomic_ref.h" } + module atomic_sync { header "__atomic/atomic_sync.h" } + module atomic { + header "__atomic/atomic.h" + export std.atomic.atomic_base // most of std::atomic methods are defined there + } + module check_memory_order { header "__atomic/check_memory_order.h" } + module contention_t { header "__atomic/contention_t.h" } + module cxx_atomic_impl { header "__atomic/cxx_atomic_impl.h" } + module fence { header "__atomic/fence.h" } + module is_always_lock_free { header "__atomic/is_always_lock_free.h" } + module kill_dependency { header "__atomic/kill_dependency.h" } + module memory_order { header "__atomic/memory_order.h" } + module to_gcc_order { header "__atomic/to_gcc_order.h" } + + header "atomic" + export * + } + + module barrier { + header "barrier" + export * + } + + module bit { + module bit_cast { header "__bit/bit_cast.h" } + module bit_ceil { header "__bit/bit_ceil.h" } + module bit_floor { header "__bit/bit_floor.h" } + module bit_log2 { header "__bit/bit_log2.h" } + module bit_width { header "__bit/bit_width.h" } + module blsr { header "__bit/blsr.h" } + module byteswap { header "__bit/byteswap.h" } + module countl { header "__bit/countl.h" } + module countr { header "__bit/countr.h" } + module endian { header "__bit/endian.h" } + module has_single_bit { header "__bit/has_single_bit.h" } + module invert_if { header "__bit/invert_if.h" } + module popcount { header "__bit/popcount.h" } + module rotate { header "__bit/rotate.h" } + + header "bit" + export * + } + + module bitset { + header "bitset" + export * + } + + module charconv { + module chars_format { header "__charconv/chars_format.h" } + module from_chars_integral { header "__charconv/from_chars_integral.h" } + module from_chars_result { header "__charconv/from_chars_result.h" } + module tables { header "__charconv/tables.h" } + module to_chars { header "__charconv/to_chars.h" } + module to_chars_base_10 { header "__charconv/to_chars_base_10.h" } + module 
to_chars_floating_point { header "__charconv/to_chars_floating_point.h" } + module to_chars_integral { header "__charconv/to_chars_integral.h" } + module to_chars_result { header "__charconv/to_chars_result.h" } + module traits { header "__charconv/traits.h" } + + header "charconv" + export * + } + + module chrono { + module calendar { header "__chrono/calendar.h" } + module concepts { header "__chrono/concepts.h" } + module convert_to_timespec { header "__chrono/convert_to_timespec.h" } + module convert_to_tm { header "__chrono/convert_to_tm.h" } + module day { header "__chrono/day.h" } + module duration { header "__chrono/duration.h" } + module exception { header "__chrono/exception.h" } + module file_clock { header "__chrono/file_clock.h" } + module formatter { header "__chrono/formatter.h" } + module hh_mm_ss { header "__chrono/hh_mm_ss.h" } + module high_resolution_clock { + header "__chrono/high_resolution_clock.h" + export * + } + module leap_second { + header "__chrono/leap_second.h" + } + module literals { + header "__chrono/literals.h" + } + module local_info { + header "__chrono/local_info.h" + export std.chrono.sys_info + } + module month_weekday { header "__chrono/month_weekday.h" } + module month { header "__chrono/month.h" } + module monthday { header "__chrono/monthday.h" } + module ostream { header "__chrono/ostream.h" } + module parser_std_format_spec { header "__chrono/parser_std_format_spec.h" } + module statically_widen { header "__chrono/statically_widen.h" } + module steady_clock { + header "__chrono/steady_clock.h" + export std.chrono.time_point + } + module sys_info { + header "__chrono/sys_info.h" + } + module system_clock { + header "__chrono/system_clock.h" + export std.chrono.time_point + } + module time_point { header "__chrono/time_point.h" } + module time_zone_link { header "__chrono/time_zone_link.h" } + module time_zone { header "__chrono/time_zone.h" } + module tzdb_list { + header "__chrono/tzdb_list.h" + export std.forward_list 
// forward_list iterators are used to implement this API + export std.string_view // by-value argument of type std::string_view + } + module tzdb { + header "__chrono/tzdb.h" + export std.string // public data member of type std::string + export std.vector // public data members of type std::vector + } + module weekday { header "__chrono/weekday.h" } + module year_month_day { header "__chrono/year_month_day.h" } + module year_month_weekday { header "__chrono/year_month_weekday.h" } + module year_month { header "__chrono/year_month.h" } + module year { header "__chrono/year.h" } + module zoned_time { header "__chrono/zoned_time.h" } + + header "chrono" + export * + } // module chrono + + module codecvt { + header "codecvt" + export * + } + + module compare { + module common_comparison_category { header "__compare/common_comparison_category.h" } + module compare_partial_order_fallback { header "__compare/compare_partial_order_fallback.h" } + module compare_strong_order_fallback { header "__compare/compare_strong_order_fallback.h" } + module compare_three_way { header "__compare/compare_three_way.h" } + module compare_three_way_result { header "__compare/compare_three_way_result.h" } + module compare_weak_order_fallback { header "__compare/compare_weak_order_fallback.h" } + module is_eq { header "__compare/is_eq.h" } + module ordering { header "__compare/ordering.h" } + module partial_order { header "__compare/partial_order.h" } + module strong_order { header "__compare/strong_order.h" } + module synth_three_way { header "__compare/synth_three_way.h" } + module three_way_comparable { header "__compare/three_way_comparable.h" } + module weak_order { header "__compare/weak_order.h" } + + header "compare" + export * + } + + module complex { + module fwd { header "__fwd/complex.h" } + + header "complex" + export * + } + + module concepts { + module arithmetic { header "__concepts/arithmetic.h" } + module assignable { header "__concepts/assignable.h" } + module 
boolean_testable { header "__concepts/boolean_testable.h" } + module class_or_enum { header "__concepts/class_or_enum.h" } + module common_reference_with { header "__concepts/common_reference_with.h" } + module common_with { header "__concepts/common_with.h" } + module constructible { header "__concepts/constructible.h" } + module convertible_to { header "__concepts/convertible_to.h" } + module copyable { header "__concepts/copyable.h" } + module derived_from { header "__concepts/derived_from.h" } + module destructible { header "__concepts/destructible.h" } + module different_from { header "__concepts/different_from.h" } + module equality_comparable { header "__concepts/equality_comparable.h" } + module invocable { header "__concepts/invocable.h" } + module movable { header "__concepts/movable.h" } + module predicate { header "__concepts/predicate.h" } + module regular { header "__concepts/regular.h" } + module relation { header "__concepts/relation.h" } + module same_as { header "__concepts/same_as.h" } + module semiregular { header "__concepts/semiregular.h" } + module swappable { header "__concepts/swappable.h" } + module totally_ordered { header "__concepts/totally_ordered.h" } + + header "concepts" + export * + } + + module condition_variable { + module condition_variable { header "__condition_variable/condition_variable.h" } + + header "condition_variable" + export * + } + + module cassert { + textual header "cassert" // NDEBUG requires textual inclusion + } + + module ccomplex { + header "ccomplex" + export * + } + + module cctype { + header "cctype" + export * + } + + module cerrno { + header "cerrno" + export * + } + + module cfenv { + header "cfenv" + export * + } + + module cfloat { + header "cfloat" + export * + } + + module cinttypes { + header "cinttypes" + export * + } + + module ciso646 { + header "ciso646" + export * + } + + module climits { + header "climits" + export * + } + + module clocale { + header "clocale" + export * + } + + module cmath { 
+ header "cmath" + export * + } + + // TODO: Make non-textual. This seems to cause problems when compiling against Glibc. + module csetjmp { + textual header "csetjmp" + } + + module csignal { + header "csignal" + export * + } + + module cstdarg { + header "cstdarg" + export * + } + + module cstdbool { + header "cstdbool" + export * + } + + module cstddef { + header "cstddef" + export * + } + + module cstdio { + header "cstdio" + export * + } + + module cstdlib { + header "cstdlib" + export * + } + + module cstring { + header "cstring" + export * + } + + module ctgmath { + header "ctgmath" + export * + } + + module ctime { + header "ctime" + export * + } + + module cuchar { + header "cuchar" + export * + } + + module cwchar { + header "cwchar" + export * + } + + module cwctype { + header "cwctype" + export * + } + + module deque { + module fwd { header "__fwd/deque.h" } + + header "deque" + export * + } + + module exception { + module exception { header "__exception/exception.h" } + module exception_ptr { header "__exception/exception_ptr.h" } + module nested_exception { header "__exception/nested_exception.h" } + module operations { header "__exception/operations.h" } + module terminate { header "__exception/terminate.h" } + + header "exception" + export * + } + + module execution { + header "execution" + export * + } + + module expected { + module bad_expected_access { header "__expected/bad_expected_access.h" } + module expected { header "__expected/expected.h" } + module unexpect { header "__expected/unexpect.h" } + module unexpected { header "__expected/unexpected.h" } + + header "expected" + export * + } + + module filesystem { + module copy_options { header "__filesystem/copy_options.h" } + module directory_entry { header "__filesystem/directory_entry.h" } + module directory_iterator { header "__filesystem/directory_iterator.h" } + module directory_options { header "__filesystem/directory_options.h" } + module file_status { header 
"__filesystem/file_status.h" } + module file_time_type { header "__filesystem/file_time_type.h" } + module file_type { header "__filesystem/file_type.h" } + module filesystem_error { header "__filesystem/filesystem_error.h" } + module operations { header "__filesystem/operations.h" } + module path_iterator { header "__filesystem/path_iterator.h" } + module path { + header "__filesystem/path.h" + export std.string // returned by various methods of filesystem::path + } + module perm_options { header "__filesystem/perm_options.h" } + module perms { header "__filesystem/perms.h" } + module recursive_directory_iterator { header "__filesystem/recursive_directory_iterator.h" } + module space_info { header "__filesystem/space_info.h" } + module u8path { header "__filesystem/u8path.h" } + + header "filesystem" + export * + } + + module format { + module buffer { header "__format/buffer.h" } + module concepts { header "__format/concepts.h" } + module container_adaptor { header "__format/container_adaptor.h" } + module enable_insertable { header "__format/enable_insertable.h" } + module escaped_output_table { header "__format/escaped_output_table.h" } + module extended_grapheme_cluster_table { header "__format/extended_grapheme_cluster_table.h" } + module format_arg { header "__format/format_arg.h" } + module format_arg_store { header "__format/format_arg_store.h" } + module format_args { header "__format/format_args.h" } + module format_context { + header "__format/format_context.h" + export std.optional // default argument for __format_context_create + } + module format_error { + header "__format/format_error.h" + } + module format_functions { + header "__format/format_functions.h" + export std.string // returned by the functions in that header + } + module format_parse_context { header "__format/format_parse_context.h" } + module format_string { header "__format/format_string.h" } + module format_to_n_result { header "__format/format_to_n_result.h" } + module formatter { 
header "__format/formatter.h" } + module formatter_bool { header "__format/formatter_bool.h" } + module formatter_char { header "__format/formatter_char.h" } + module formatter_floating_point { header "__format/formatter_floating_point.h" } + module formatter_integer { header "__format/formatter_integer.h" } + module formatter_integral { header "__format/formatter_integral.h" } + module formatter_output { header "__format/formatter_output.h" } + module formatter_pointer { header "__format/formatter_pointer.h" } + module formatter_string { header "__format/formatter_string.h" } + module formatter_tuple { header "__format/formatter_tuple.h" } + module fwd { header "__fwd/format.h" } + module indic_conjunct_break_table { header "__format/indic_conjunct_break_table.h" } + module parser_std_format_spec { header "__format/parser_std_format_spec.h" } + module range_default_formatter { header "__format/range_default_formatter.h" } + module range_formatter { header "__format/range_formatter.h" } + module unicode { header "__format/unicode.h" } + module width_estimation_table { header "__format/width_estimation_table.h" } + module write_escaped { header "__format/write_escaped.h" } + + header "format" + export * + } // module format + + module forward_list { + header "forward_list" + export * + } + + module fstream { + module fwd { header "__fwd/fstream.h" } + + header "fstream" + export * + } + + module functional { + module binary_function { header "__functional/binary_function.h" } + module binary_negate { header "__functional/binary_negate.h" } + module bind_back { + header "__functional/bind_back.h" + export std.functional.perfect_forward // inherited from and using its operators + } + module bind_front { + header "__functional/bind_front.h" + export std.functional.perfect_forward // inherited from and using its operators + } + module bind { header "__functional/bind.h" } + module binder1st { header "__functional/binder1st.h" } + module binder2nd { header 
"__functional/binder2nd.h" } + module boyer_moore_searcher { + header "__functional/boyer_moore_searcher.h" + export std.memory.shared_ptr + } + module compose { + header "__functional/compose.h" + export std.functional.perfect_forward // inherited from and using its operators + } + module default_searcher { header "__functional/default_searcher.h" } + module function { header "__functional/function.h" } + module hash { header "__functional/hash.h" } + module identity { header "__functional/identity.h" } + module invoke { header "__functional/invoke.h" } + module is_transparent { header "__functional/is_transparent.h" } + module mem_fn { header "__functional/mem_fn.h" } + module mem_fun_ref { header "__functional/mem_fun_ref.h" } + module not_fn { + header "__functional/not_fn.h" + export std.functional.perfect_forward // inherited from and using its operators + } + module operations { header "__functional/operations.h" } + module perfect_forward { + header "__functional/perfect_forward.h" + export std.tuple + } + module pointer_to_binary_function { header "__functional/pointer_to_binary_function.h" } + module pointer_to_unary_function { header "__functional/pointer_to_unary_function.h" } + module ranges_operations { header "__functional/ranges_operations.h" } + module reference_wrapper { header "__functional/reference_wrapper.h" } + module unary_function { header "__functional/unary_function.h" } + module unary_negate { header "__functional/unary_negate.h" } + module weak_result_type { header "__functional/weak_result_type.h" } + + header "functional" + export * + } // module functional + + module future { + header "future" + export * + } + + module initializer_list { + header "initializer_list" + export * + } + + module iomanip { + header "iomanip" + export * + } + + module ios { + module fwd { header "__fwd/ios.h" } + module fpos { header "__ios/fpos.h" } + + header "ios" + export * + } + + module iosfwd { + header "iosfwd" + export * + } + + module iostream { + 
header "iostream" + export * + } + + module istream { + module fwd { header "__fwd/istream.h" } + + header "istream" + export std.ios // base class + } -// Experimental C++ standard library interfaces -module std_experimental [system] { module iterator { - header "experimental/iterator" + module access { header "__iterator/access.h" } + module advance { header "__iterator/advance.h" } + module aliasing_iterator { header "__iterator/aliasing_iterator.h" } + module back_insert_iterator { header "__iterator/back_insert_iterator.h" } + module bounded_iter { header "__iterator/bounded_iter.h" } + module common_iterator { header "__iterator/common_iterator.h" } + module concepts { + header "__iterator/concepts.h" + export std_core.type_traits.common_reference + } + module counted_iterator { header "__iterator/counted_iterator.h" } + module cpp17_iterator_concepts { header "__iterator/cpp17_iterator_concepts.h" } + module data { header "__iterator/data.h" } + module default_sentinel { header "__iterator/default_sentinel.h" } + module distance { header "__iterator/distance.h" } + module empty { header "__iterator/empty.h" } + module erase_if_container { header "__iterator/erase_if_container.h" } + module front_insert_iterator { header "__iterator/front_insert_iterator.h" } + module incrementable_traits { header "__iterator/incrementable_traits.h" } + module indirectly_comparable { header "__iterator/indirectly_comparable.h" } + module insert_iterator { header "__iterator/insert_iterator.h" } + module istream_iterator { header "__iterator/istream_iterator.h" } + module istreambuf_iterator { header "__iterator/istreambuf_iterator.h" } + module iter_move { header "__iterator/iter_move.h" } + module iter_swap { header "__iterator/iter_swap.h" } + module iterator_traits { + header "__iterator/iterator_traits.h" + export std_core.type_traits.integral_constant + } + module iterator_with_data { header "__iterator/iterator_with_data.h" } + module iterator { header 
"__iterator/iterator.h" } + module mergeable { header "__iterator/mergeable.h" } + module move_iterator { header "__iterator/move_iterator.h" } + module move_sentinel { header "__iterator/move_sentinel.h" } + module next { header "__iterator/next.h" } + module ostream_iterator { header "__iterator/ostream_iterator.h" } + module ostreambuf_iterator { + header "__iterator/ostreambuf_iterator.h" + export iosfwd // for default template argument of ostreambuf_iterator + } + module permutable { header "__iterator/permutable.h" } + module prev { header "__iterator/prev.h" } + module projected { header "__iterator/projected.h" } + module ranges_iterator_traits { header "__iterator/ranges_iterator_traits.h" } + module readable_traits { header "__iterator/readable_traits.h" } + module reverse_access { header "__iterator/reverse_access.h" } + module reverse_iterator { header "__iterator/reverse_iterator.h" } + module segmented_iterator { header "__iterator/segmented_iterator.h" } + module size { header "__iterator/size.h" } + module sortable { header "__iterator/sortable.h" } + module unreachable_sentinel { header "__iterator/unreachable_sentinel.h" } + module wrap_iter { header "__iterator/wrap_iter.h" } + + header "iterator" + export * + } + + module latch { + header "latch" + export * + } + + module list { + header "list" + export * + } + + module locale { + header "locale" + header "__locale_dir/locale_base_api.h" + header "__locale_dir/locale_base_api/locale_guard.h" + module locale_base_api { + textual header "__locale_dir/locale_base_api/android.h" + textual header "__locale_dir/locale_base_api/bsd_locale_defaults.h" + textual header "__locale_dir/locale_base_api/bsd_locale_fallbacks.h" + textual header "__locale_dir/locale_base_api/fuchsia.h" + textual header "__locale_dir/locale_base_api/ibm.h" + textual header "__locale_dir/locale_base_api/musl.h" + textual header "__locale_dir/locale_base_api/newlib.h" + textual header "__locale_dir/locale_base_api/openbsd.h" + 
textual header "__locale_dir/locale_base_api/win32.h" + } + export * + } + + // TODO: Understand why this needs to live in its own module + module locale_base [system] { + header "__locale" + export * + } + + module map { + header "map" + export * + } + + module mdspan { + module default_accessor { header "__mdspan/default_accessor.h" } + module extents { header "__mdspan/extents.h" } + module fwd { header "__fwd/mdspan.h" } + module layout_left { header "__mdspan/layout_left.h" } + module layout_right { header "__mdspan/layout_right.h" } + module layout_stride { header "__mdspan/layout_stride.h" } + module mdspan { + header "__mdspan/mdspan.h" + export std.array // returned by some methods + } + + header "mdspan" export * } + module memory { - header "experimental/memory" + module addressof { header "__memory/addressof.h" } + module align { header "__memory/align.h" } + module aligned_alloc { header "__memory/aligned_alloc.h" } + module allocate_at_least { header "__memory/allocate_at_least.h" } + module allocation_guard { header "__memory/allocation_guard.h" } + module allocator { header "__memory/allocator.h" } + module allocator_arg_t { header "__memory/allocator_arg_t.h" } + module allocator_destructor { header "__memory/allocator_destructor.h" } + module allocator_traits { header "__memory/allocator_traits.h" } + module array_cookie { header "__memory/array_cookie.h" } + module assume_aligned { header "__memory/assume_aligned.h" } + module auto_ptr { header "__memory/auto_ptr.h" } + module builtin_new_allocator { header "__memory/builtin_new_allocator.h" } + module compressed_pair { header "__memory/compressed_pair.h" } + module concepts { header "__memory/concepts.h" } + module construct_at { header "__memory/construct_at.h" } + module destruct_n { header "__memory/destruct_n.h" } + module fwd { header "__fwd/memory.h" } + module inout_ptr { header "__memory/inout_ptr.h" } + module noexcept_move_assign_container { header 
"__memory/noexcept_move_assign_container.h" } + module out_ptr { header "__memory/out_ptr.h" } + module pointer_traits { header "__memory/pointer_traits.h" } + module ranges_construct_at { header "__memory/ranges_construct_at.h" } + module ranges_uninitialized_algorithms { + header "__memory/ranges_uninitialized_algorithms.h" + export std.algorithm.in_out_result + } + module raw_storage_iterator { header "__memory/raw_storage_iterator.h" } + module shared_ptr { header "__memory/shared_ptr.h" } + module swap_allocator { header "__memory/swap_allocator.h" } + module temp_value { header "__memory/temp_value.h" } + module temporary_buffer { + header "__memory/temporary_buffer.h" + export std.utility.pair // return type of std::get_temporary_buffer() + } + module uninitialized_algorithms { + header "__memory/uninitialized_algorithms.h" + } + module unique_ptr { + header "__memory/unique_ptr.h" + } + module unique_temporary_buffer { + header "__memory/unique_temporary_buffer.h" + export std.memory.unique_ptr + export std_core.type_traits.is_constant_evaluated + } + module uses_allocator { header "__memory/uses_allocator.h" } + module uses_allocator_construction { header "__memory/uses_allocator_construction.h" } + + header "memory" export * } - module propagate_const { - header "experimental/propagate_const" + + module memory_resource { + module fwd { header "__fwd/memory_resource.h" } + module memory_resource { header "__memory_resource/memory_resource.h" } + module monotonic_buffer_resource { header "__memory_resource/monotonic_buffer_resource.h" } + module polymorphic_allocator { header "__memory_resource/polymorphic_allocator.h" } + module pool_options { header "__memory_resource/pool_options.h" } + module synchronized_pool_resource { header "__memory_resource/synchronized_pool_resource.h" } + module unsynchronized_pool_resource { header "__memory_resource/unsynchronized_pool_resource.h" } + + header "memory_resource" export * } - module simd { - module aligned_tag { 
private header "experimental/__simd/aligned_tag.h" } - module declaration { private header "experimental/__simd/declaration.h" } - module reference { private header "experimental/__simd/reference.h" } - module scalar { private header "experimental/__simd/scalar.h" } - module simd { private header "experimental/__simd/simd.h" } - module simd_mask { private header "experimental/__simd/simd_mask.h" } - module traits { private header "experimental/__simd/traits.h" } - module utility { private header "experimental/__simd/utility.h" } - module vec_ext { private header "experimental/__simd/vec_ext.h" } - header "experimental/simd" + module mutex { + module lock_guard { header "__mutex/lock_guard.h" } + module mutex { header "__mutex/mutex.h" } + module once_flag { header "__mutex/once_flag.h" } + module tag_types { header "__mutex/tag_types.h" } + module unique_lock { header "__mutex/unique_lock.h" } + + header "mutex" export * } - module type_traits { - header "experimental/type_traits" + + module new { + header "new" export * } - module utility { - header "experimental/utility" + + module numbers { + header "numbers" export * } -} -// Convenience method to get all of the above modules in a single import statement. -// Importing only the needed modules is likely to be more performant. 
-module std [system] { - header "__std_clang_module" - export * -} + module numeric { + module accumulate { header "__numeric/accumulate.h" } + module adjacent_difference { header "__numeric/adjacent_difference.h" } + module exclusive_scan { header "__numeric/exclusive_scan.h" } + module gcd_lcm { header "__numeric/gcd_lcm.h" } + module inclusive_scan { header "__numeric/inclusive_scan.h" } + module inner_product { header "__numeric/inner_product.h" } + module iota { header "__numeric/iota.h" } + module midpoint { header "__numeric/midpoint.h" } + module partial_sum { header "__numeric/partial_sum.h" } + module pstl { header "__numeric/pstl.h" } + module reduce { header "__numeric/reduce.h" } + module saturation_arithmetic { header "__numeric/saturation_arithmetic.h" } + module transform_exclusive_scan { header "__numeric/transform_exclusive_scan.h" } + module transform_inclusive_scan { header "__numeric/transform_inclusive_scan.h" } + module transform_reduce { header "__numeric/transform_reduce.h" } -// Implementation detail headers that are private to libc++. These modules -// must not be directly imported. 
-module std_private_assert [system] { - header "__assert" - export * -} -module std_private_bit_reference [system] { - header "__bit_reference" - export * -} -module std_private_fwd_bit_reference [system] { - header "__fwd/bit_reference.h" -} -module std_private_fwd_byte [system] { - header "__fwd/byte.h" -} -module std_private_config [system] { - textual header "__config" - textual header "__configuration/abi.h" - textual header "__configuration/availability.h" - textual header "__configuration/compiler.h" - textual header "__configuration/language.h" - textual header "__configuration/platform.h" - export * -} -module std_private_hash_table [system] { - header "__hash_table" - export * -} -module std_private_locale [system] { - header "__locale" - export * -} -module std_private_mbstate_t [system] { - header "__mbstate_t.h" - export * -} -module std_private_node_handle [system] { - header "__node_handle" - export * -} -module std_private_split_buffer [system] { - header "__split_buffer" - export * -} -module std_private_std_mbstate_t [system] { - header "__std_mbstate_t.h" - export * -} -module std_private_tree [system] { - header "__tree" - export * -} -module std_private_undef_macros [system] { - textual header "__undef_macros" - export * -} -module std_private_verbose_abort [system] { - header "__verbose_abort" - export * -} + header "numeric" + export * + } + + module optional { + header "optional" + export * + } + + module ostream { + module basic_ostream { + header "__ostream/basic_ostream.h" + export std.ios // base class + } + module fwd { + header "__fwd/ostream.h" + } + module print { + header "__ostream/print.h" + export * + } + + header "ostream" + export * + } + + module print { + header "print" + export * + } + + module queue { + module fwd { header "__fwd/queue.h" } + + header "queue" + export * + } + + module random { + module bernoulli_distribution { header "__random/bernoulli_distribution.h" } + module binomial_distribution { header 
"__random/binomial_distribution.h" } + module cauchy_distribution { header "__random/cauchy_distribution.h" } + module chi_squared_distribution { header "__random/chi_squared_distribution.h" } + module clamp_to_integral { header "__random/clamp_to_integral.h" } + module default_random_engine { header "__random/default_random_engine.h" } + module discard_block_engine { header "__random/discard_block_engine.h" } + module discrete_distribution { header "__random/discrete_distribution.h" } + module exponential_distribution { header "__random/exponential_distribution.h" } + module extreme_value_distribution { header "__random/extreme_value_distribution.h" } + module fisher_f_distribution { header "__random/fisher_f_distribution.h" } + module gamma_distribution { header "__random/gamma_distribution.h" } + module generate_canonical { header "__random/generate_canonical.h" } + module geometric_distribution { header "__random/geometric_distribution.h" } + module independent_bits_engine { header "__random/independent_bits_engine.h" } + module is_seed_sequence { header "__random/is_seed_sequence.h" } + module is_valid { + header "__random/is_valid.h" + export std_core.type_traits.integral_constant + } + module knuth_b { header "__random/knuth_b.h" } + module linear_congruential_engine { header "__random/linear_congruential_engine.h" } + module log2 { header "__random/log2.h" } + module lognormal_distribution { header "__random/lognormal_distribution.h" } + module mersenne_twister_engine { header "__random/mersenne_twister_engine.h" } + module negative_binomial_distribution { header "__random/negative_binomial_distribution.h" } + module normal_distribution { header "__random/normal_distribution.h" } + module piecewise_constant_distribution { header "__random/piecewise_constant_distribution.h" } + module piecewise_linear_distribution { header "__random/piecewise_linear_distribution.h" } + module poisson_distribution { header "__random/poisson_distribution.h" } + module 
random_device { header "__random/random_device.h" } + module ranlux { header "__random/ranlux.h" } + module seed_seq { header "__random/seed_seq.h" } + module shuffle_order_engine { header "__random/shuffle_order_engine.h" } + module student_t_distribution { header "__random/student_t_distribution.h" } + module subtract_with_carry_engine { header "__random/subtract_with_carry_engine.h" } + module uniform_int_distribution { header "__random/uniform_int_distribution.h" } + module uniform_random_bit_generator { header "__random/uniform_random_bit_generator.h" } + module uniform_real_distribution { header "__random/uniform_real_distribution.h" } + module weibull_distribution { header "__random/weibull_distribution.h" } + + header "random" + export * + } + + module ranges { + module access { header "__ranges/access.h" } + module all { header "__ranges/all.h" } + module as_rvalue_view { header "__ranges/as_rvalue_view.h" } + module chunk_by_view { + header "__ranges/chunk_by_view.h" + export std.functional.bind_back + } + module common_view { header "__ranges/common_view.h" } + module concepts { header "__ranges/concepts.h" } + module container_compatible_range { header "__ranges/container_compatible_range.h" } + module counted { + header "__ranges/counted.h" + export std.span // return type of views::counted + export std.ranges.subrange // return type of views::counted + } + module dangling { + header "__ranges/dangling.h" + } + module data { + header "__ranges/data.h" + } + module drop_view { + header "__ranges/drop_view.h" + export std.functional.bind_back + } + module drop_while_view { + header "__ranges/drop_while_view.h" + export std.functional.bind_back + } + module elements_view { header "__ranges/elements_view.h" } + module empty { header "__ranges/empty.h" } + module empty_view { header "__ranges/empty_view.h" } + module enable_borrowed_range { header "__ranges/enable_borrowed_range.h" } + module enable_view { header "__ranges/enable_view.h" } + module 
filter_view { + header "__ranges/filter_view.h" + export std.functional.bind_back + } + module from_range { header "__ranges/from_range.h" } + module iota_view { header "__ranges/iota_view.h" } + module istream_view { header "__ranges/istream_view.h" } + module join_view { header "__ranges/join_view.h" } + module lazy_split_view { + header "__ranges/lazy_split_view.h" + export std.functional.bind_back + } + module movable_box { header "__ranges/movable_box.h" } + module non_propagating_cache { header "__ranges/non_propagating_cache.h" } + module owning_view { header "__ranges/owning_view.h" } + module range_adaptor { header "__ranges/range_adaptor.h" } + module rbegin { header "__ranges/rbegin.h" } + module ref_view { header "__ranges/ref_view.h" } + module rend { header "__ranges/rend.h" } + module repeat_view { header "__ranges/repeat_view.h" } + module reverse_view { header "__ranges/reverse_view.h" } + module single_view { header "__ranges/single_view.h" } + module size { header "__ranges/size.h" } + module split_view { + header "__ranges/split_view.h" + export std.functional.bind_back + } + module subrange { + header "__ranges/subrange.h" + export std.ranges.subrange_fwd + } + module subrange_fwd { + header "__fwd/subrange.h" + } + module take_view { + header "__ranges/take_view.h" + export std.functional.bind_back + } + module take_while_view { + header "__ranges/take_while_view.h" + export std.functional.bind_back + } + module to { + header "__ranges/to.h" + export std.functional.bind_back + } + module transform_view { + header "__ranges/transform_view.h" + export std.functional.bind_back + } + module view_interface { + header "__ranges/view_interface.h" + } + module views { + header "__ranges/views.h" + } + module zip_view { + header "__ranges/zip_view.h" + export std.utility.pair + } + + header "ranges" + export * + } // module ranges + + module ratio { + header "ratio" + export * + } + + module regex { + header "regex" + export * + } + + module 
scoped_allocator { + header "scoped_allocator" + export * + } + + module semaphore { + header "semaphore" + export * + } + + module set { + header "set" + export * + } + + module shared_mutex { + header "shared_mutex" + export * + } + + module source_location { + header "source_location" + export * + } + + module span { + module fwd { header "__fwd/span.h" } + + header "span" + export * + } + + module sstream { + module fwd { header "__fwd/sstream.h" } + + header "sstream" + export * + } + + module stack { + module fwd { header "__fwd/stack.h" } + + header "stack" + export * + } + + module stdexcept { + header "stdexcept" + export * + } + + module stop_token { + module atomic_unique_lock { header "__stop_token/atomic_unique_lock.h" } + module intrusive_list_view { header "__stop_token/intrusive_list_view.h" } + module intrusive_shared_ptr { header "__stop_token/intrusive_shared_ptr.h" } + module stop_callback { header "__stop_token/stop_callback.h" } + module stop_source { header "__stop_token/stop_source.h" } + module stop_state { header "__stop_token/stop_state.h" } + module stop_token { header "__stop_token/stop_token.h" } + + header "stop_token" + export * + } + + module streambuf { + module fwd { header "__fwd/streambuf.h" } + + header "streambuf" + export * + } + + module string { + module char_traits { header "__string/char_traits.h" } + module constexpr_c_functions { header "__string/constexpr_c_functions.h" } + module extern_template_lists { header "__string/extern_template_lists.h" } + module fwd { header "__fwd/string.h" } + + header "string" + export * + } + + module string_view { + module fwd { header "__fwd/string_view.h" } + + header "string_view" + export * + } + + module strstream { + header "strstream" + export * + } + + module syncstream { + header "syncstream" + export * + } + + module system_error { + module errc { header "__system_error/errc.h" } + module error_category { header "__system_error/error_category.h" } + module error_code { + 
header "__system_error/error_code.h" + export std.system_error.error_category // methods of error_code return that type + } + module error_condition { header "__system_error/error_condition.h" } + module system_error { header "__system_error/system_error.h" } + + header "system_error" + export * + } + + module thread { + module formatter { header "__thread/formatter.h" } + module id { header "__thread/id.h" } + module jthread { header "__thread/jthread.h" } + module poll_with_backoff { header "__thread/poll_with_backoff.h" } + module this_thread { header "__thread/this_thread.h" } + module thread { header "__thread/thread.h" } + module timed_backoff_policy { header "__thread/timed_backoff_policy.h" } + + module support { + header "__thread/support.h" + export * + } + module support_impl { + textual header "__thread/support/c11.h" + textual header "__thread/support/external.h" + textual header "__thread/support/pthread.h" + textual header "__thread/support/windows.h" + } + + header "thread" + export * + } + + module tuple { + module find_index { header "__tuple/find_index.h" } + module ignore { header "__tuple/ignore.h" } + module make_tuple_types { header "__tuple/make_tuple_types.h" } + module sfinae_helpers { header "__tuple/sfinae_helpers.h" } + module tuple_element { header "__tuple/tuple_element.h" } + module tuple_indices { header "__tuple/tuple_indices.h" } + module tuple_like_ext { header "__tuple/tuple_like_ext.h" } + module tuple_like_no_subrange { header "__tuple/tuple_like_no_subrange.h" } + module tuple_like { header "__tuple/tuple_like.h" } + module tuple_size { header "__tuple/tuple_size.h" } + module tuple_types { header "__tuple/tuple_types.h" } + + header "tuple" + export * + } + + module typeindex { + header "typeindex" + export * + } + + module typeinfo { + header "typeinfo" + export * + } + + module unordered_map { + header "unordered_map" + export * + } + + module unordered_set { + header "unordered_set" + export * + } + + module utility { + 
module as_const { header "__utility/as_const.h" } + module as_lvalue { header "__utility/as_lvalue.h" } + module auto_cast { + header "__utility/auto_cast.h" + export std_core.type_traits.decay // the macro expansion uses that trait + } + module cmp { header "__utility/cmp.h" } + module convert_to_integral { header "__utility/convert_to_integral.h" } + module exception_guard { header "__utility/exception_guard.h" } + module exchange { header "__utility/exchange.h" } + module forward_like { header "__utility/forward_like.h" } + module in_place { + header "__utility/in_place.h" + export std_core.type_traits.integral_constant + } + module integer_sequence { header "__utility/integer_sequence.h" } + module is_pointer_in_range { header "__utility/is_pointer_in_range.h" } + module is_valid_range { header "__utility/is_valid_range.h" } + module move { header "__utility/move.h" } + module no_destroy { header "__utility/no_destroy.h" } + module pair { header "__utility/pair.h" } + module piecewise_construct { header "__utility/piecewise_construct.h" } + module priority_tag { header "__utility/priority_tag.h" } + module private_constructor_tag { header "__utility/private_constructor_tag.h" } + module rel_ops { header "__utility/rel_ops.h" } + module small_buffer { header "__utility/small_buffer.h" } + module swap { header "__utility/swap.h" } + module to_underlying { header "__utility/to_underlying.h" } + module unreachable { header "__utility/unreachable.h" } + + header "utility" + export * + } + + module valarray { + header "valarray" + export * + } -module std_private_algorithm_adjacent_find [system] { header "__algorithm/adjacent_find.h" } -module std_private_algorithm_all_of [system] { header "__algorithm/all_of.h" } -module std_private_algorithm_any_of [system] { header "__algorithm/any_of.h" } -module std_private_algorithm_binary_search [system] { header "__algorithm/binary_search.h" } -module std_private_algorithm_clamp [system] { header "__algorithm/clamp.h" } 
-module std_private_algorithm_comp [system] { header "__algorithm/comp.h" } -module std_private_algorithm_comp_ref_type [system] { header "__algorithm/comp_ref_type.h" } -module std_private_algorithm_copy [system] { - header "__algorithm/copy.h" - export std_private_algorithm_copy_move_common -} -module std_private_algorithm_copy_backward [system] { header "__algorithm/copy_backward.h" } -module std_private_algorithm_copy_if [system] { header "__algorithm/copy_if.h" } -module std_private_algorithm_copy_move_common [system] { - header "__algorithm/copy_move_common.h" - export std_private_type_traits_is_trivially_copyable -} -module std_private_algorithm_copy_n [system] { header "__algorithm/copy_n.h" } -module std_private_algorithm_count [system] { header "__algorithm/count.h" } -module std_private_algorithm_count_if [system] { header "__algorithm/count_if.h" } -module std_private_algorithm_equal [system] { header "__algorithm/equal.h" } -module std_private_algorithm_equal_range [system] { header "__algorithm/equal_range.h" } -module std_private_algorithm_fill [system] { header "__algorithm/fill.h" } -module std_private_algorithm_fill_n [system] { header "__algorithm/fill_n.h" } -module std_private_algorithm_find [system] { - header "__algorithm/find.h" - export std_private_algorithm_unwrap_iter -} -module std_private_algorithm_find_end [system] { header "__algorithm/find_end.h" } -module std_private_algorithm_find_first_of [system] { header "__algorithm/find_first_of.h" } -module std_private_algorithm_find_if [system] { header "__algorithm/find_if.h" } -module std_private_algorithm_find_if_not [system] { header "__algorithm/find_if_not.h" } -module std_private_algorithm_find_segment_if [system] { header "__algorithm/find_segment_if.h" } -module std_private_algorithm_for_each [system] { header "__algorithm/for_each.h" } -module std_private_algorithm_for_each_n [system] { header "__algorithm/for_each_n.h" } -module std_private_algorithm_for_each_segment [system] { 
header "__algorithm/for_each_segment.h" } -module std_private_algorithm_generate [system] { header "__algorithm/generate.h" } -module std_private_algorithm_generate_n [system] { header "__algorithm/generate_n.h" } -module std_private_algorithm_half_positive [system] { header "__algorithm/half_positive.h" } -module std_private_algorithm_in_found_result [system] { header "__algorithm/in_found_result.h" } -module std_private_algorithm_in_fun_result [system] { header "__algorithm/in_fun_result.h" } -module std_private_algorithm_in_in_out_result [system] { header "__algorithm/in_in_out_result.h" } -module std_private_algorithm_in_in_result [system] { header "__algorithm/in_in_result.h" } -module std_private_algorithm_in_out_out_result [system] { header "__algorithm/in_out_out_result.h" } -module std_private_algorithm_in_out_result [system] { header "__algorithm/in_out_result.h" } -module std_private_algorithm_includes [system] { header "__algorithm/includes.h" } -module std_private_algorithm_inplace_merge [system] { header "__algorithm/inplace_merge.h" } -module std_private_algorithm_is_heap [system] { header "__algorithm/is_heap.h" } -module std_private_algorithm_is_heap_until [system] { header "__algorithm/is_heap_until.h" } -module std_private_algorithm_is_partitioned [system] { header "__algorithm/is_partitioned.h" } -module std_private_algorithm_is_permutation [system] { header "__algorithm/is_permutation.h" } -module std_private_algorithm_is_sorted [system] { header "__algorithm/is_sorted.h" } -module std_private_algorithm_is_sorted_until [system] { header "__algorithm/is_sorted_until.h" } -module std_private_algorithm_iter_swap [system] { header "__algorithm/iter_swap.h" } -module std_private_algorithm_iterator_operations [system] { - header "__algorithm/iterator_operations.h" - export * -} -module std_private_algorithm_lexicographical_compare [system] { header "__algorithm/lexicographical_compare.h" } -module 
std_private_algorithm_lexicographical_compare_three_way [system] { header "__algorithm/lexicographical_compare_three_way.h" } -module std_private_algorithm_lower_bound [system] { header "__algorithm/lower_bound.h" } -module std_private_algorithm_make_heap [system] { header "__algorithm/make_heap.h" } -module std_private_algorithm_make_projected [system] { header "__algorithm/make_projected.h" } -module std_private_algorithm_max [system] { header "__algorithm/max.h" } -module std_private_algorithm_max_element [system] { header "__algorithm/max_element.h" } -module std_private_algorithm_merge [system] { header "__algorithm/merge.h" } -module std_private_algorithm_min [system] { header "__algorithm/min.h" } -module std_private_algorithm_min_element [system] { header "__algorithm/min_element.h" } -module std_private_algorithm_min_max_result [system] { header "__algorithm/min_max_result.h" } -module std_private_algorithm_minmax [system] { - header "__algorithm/minmax.h" - export * -} -module std_private_algorithm_minmax_element [system] { header "__algorithm/minmax_element.h" } -module std_private_algorithm_mismatch [system] { - header "__algorithm/mismatch.h" - export std_private_algorithm_simd_utils - export std_private_iterator_aliasing_iterator -} -module std_private_algorithm_move [system] { header "__algorithm/move.h" } -module std_private_algorithm_move_backward [system] { header "__algorithm/move_backward.h" } -module std_private_algorithm_next_permutation [system] { header "__algorithm/next_permutation.h" } -module std_private_algorithm_none_of [system] { header "__algorithm/none_of.h" } -module std_private_algorithm_nth_element [system] { header "__algorithm/nth_element.h" } -module std_private_algorithm_partial_sort [system] { header "__algorithm/partial_sort.h" } -module std_private_algorithm_partial_sort_copy [system] { header "__algorithm/partial_sort_copy.h" } -module std_private_algorithm_partition [system] { header "__algorithm/partition.h" } -module 
std_private_algorithm_partition_copy [system] { header "__algorithm/partition_copy.h" } -module std_private_algorithm_partition_point [system] { header "__algorithm/partition_point.h" } -module std_private_algorithm_pop_heap [system] { header "__algorithm/pop_heap.h" } -module std_private_algorithm_prev_permutation [system] { header "__algorithm/prev_permutation.h" } -module std_private_algorithm_pstl [system] { - header "__algorithm/pstl.h" - export * -} -module std_private_algorithm_push_heap [system] { header "__algorithm/push_heap.h" } -module std_private_algorithm_ranges_adjacent_find [system] { header "__algorithm/ranges_adjacent_find.h" } -module std_private_algorithm_ranges_all_of [system] { header "__algorithm/ranges_all_of.h" } -module std_private_algorithm_ranges_any_of [system] { header "__algorithm/ranges_any_of.h" } -module std_private_algorithm_ranges_binary_search [system] { - header "__algorithm/ranges_binary_search.h" - export std_private_functional_ranges_operations -} -module std_private_algorithm_ranges_clamp [system] { - header "__algorithm/ranges_clamp.h" - export std_private_functional_ranges_operations -} -module std_private_algorithm_ranges_contains [system] { header "__algorithm/ranges_contains.h" } -module std_private_algorithm_ranges_contains_subrange [system] { header "__algorithm/ranges_contains_subrange.h" } -module std_private_algorithm_ranges_copy [system] { - header "__algorithm/ranges_copy.h" - export std_private_algorithm_in_out_result -} -module std_private_algorithm_ranges_copy_backward [system] { - header "__algorithm/ranges_copy_backward.h" - export std_private_algorithm_in_out_result -} -module std_private_algorithm_ranges_copy_if [system] { - header "__algorithm/ranges_copy_if.h" - export std_private_algorithm_in_out_result -} -module std_private_algorithm_ranges_copy_n [system] { - header "__algorithm/ranges_copy_n.h" - export std_private_algorithm_in_out_result -} -module std_private_algorithm_ranges_count [system] { 
header "__algorithm/ranges_count.h" } -module std_private_algorithm_ranges_count_if [system] { header "__algorithm/ranges_count_if.h" } -module std_private_algorithm_ranges_ends_with [system] { header "__algorithm/ranges_ends_with.h" } -module std_private_algorithm_ranges_equal [system] { header "__algorithm/ranges_equal.h" } -module std_private_algorithm_ranges_equal_range [system] { - header "__algorithm/ranges_equal_range.h" - export std_private_functional_ranges_operations -} -module std_private_algorithm_ranges_fill [system] { header "__algorithm/ranges_fill.h" } -module std_private_algorithm_ranges_fill_n [system] { header "__algorithm/ranges_fill_n.h" } -module std_private_algorithm_ranges_find [system] { header "__algorithm/ranges_find.h" } -module std_private_algorithm_ranges_find_end [system] { header "__algorithm/ranges_find_end.h" } -module std_private_algorithm_ranges_find_first_of [system] { header "__algorithm/ranges_find_first_of.h" } -module std_private_algorithm_ranges_find_if [system] { header "__algorithm/ranges_find_if.h" } -module std_private_algorithm_ranges_find_if_not [system] { header "__algorithm/ranges_find_if_not.h" } -module std_private_algorithm_ranges_find_last [system] { header "__algorithm/ranges_find_last.h" } -module std_private_algorithm_ranges_fold [system] { header "__algorithm/ranges_fold.h" } -module std_private_algorithm_ranges_for_each [system] { - header "__algorithm/ranges_for_each.h" - export std_private_algorithm_in_fun_result -} -module std_private_algorithm_ranges_for_each_n [system] { - header "__algorithm/ranges_for_each_n.h" - export std_private_algorithm_in_fun_result -} -module std_private_algorithm_ranges_generate [system] { header "__algorithm/ranges_generate.h" } -module std_private_algorithm_ranges_generate_n [system] { header "__algorithm/ranges_generate_n.h" } -module std_private_algorithm_ranges_includes [system] { - header "__algorithm/ranges_includes.h" - export std_private_functional_ranges_operations 
-} -module std_private_algorithm_ranges_inplace_merge [system] { - header "__algorithm/ranges_inplace_merge.h" - export std_private_functional_ranges_operations -} -module std_private_algorithm_ranges_is_heap [system] { - header "__algorithm/ranges_is_heap.h" - export std_private_functional_ranges_operations -} -module std_private_algorithm_ranges_is_heap_until [system] { - header "__algorithm/ranges_is_heap_until.h" - export std_private_functional_ranges_operations -} -module std_private_algorithm_ranges_is_partitioned [system] { header "__algorithm/ranges_is_partitioned.h" } -module std_private_algorithm_ranges_is_permutation [system] { header "__algorithm/ranges_is_permutation.h" } -module std_private_algorithm_ranges_is_sorted [system] { - header "__algorithm/ranges_is_sorted.h" - export std_private_functional_ranges_operations -} -module std_private_algorithm_ranges_is_sorted_until [system] { - header "__algorithm/ranges_is_sorted_until.h" - export std_private_functional_ranges_operations -} -module std_private_algorithm_ranges_iterator_concept [system] { header "__algorithm/ranges_iterator_concept.h" } -module std_private_algorithm_ranges_lexicographical_compare [system] { - header "__algorithm/ranges_lexicographical_compare.h" - export std_private_functional_ranges_operations -} -module std_private_algorithm_ranges_lower_bound [system] { - header "__algorithm/ranges_lower_bound.h" - export std_private_functional_ranges_operations -} -module std_private_algorithm_ranges_make_heap [system] { - header "__algorithm/ranges_make_heap.h" - export std_private_functional_ranges_operations -} -module std_private_algorithm_ranges_max [system] { - header "__algorithm/ranges_max.h" - export std_private_functional_ranges_operations -} -module std_private_algorithm_ranges_max_element [system] { - header "__algorithm/ranges_max_element.h" - export std_private_functional_ranges_operations -} -module std_private_algorithm_ranges_merge [system] { - header 
"__algorithm/ranges_merge.h" - export std_private_algorithm_in_in_out_result -} -module std_private_algorithm_ranges_min [system] { - header "__algorithm/ranges_min.h" - export std_private_functional_ranges_operations -} -module std_private_algorithm_ranges_min_element [system] { - header "__algorithm/ranges_min_element.h" - export std_private_functional_ranges_operations -} -module std_private_algorithm_ranges_minmax [system] { - header "__algorithm/ranges_minmax.h" - export std_private_functional_ranges_operations - export std_private_algorithm_min_max_result -} -module std_private_algorithm_ranges_minmax_element [system] { - header "__algorithm/ranges_minmax_element.h" - export std_private_functional_ranges_operations - export std_private_algorithm_min_max_result -} -module std_private_algorithm_ranges_mismatch [system] { - header "__algorithm/ranges_mismatch.h" - export std_private_algorithm_in_in_result -} -module std_private_algorithm_ranges_move [system] { - header "__algorithm/ranges_move.h" - export std_private_algorithm_in_out_result -} -module std_private_algorithm_ranges_move_backward [system] { - header "__algorithm/ranges_move_backward.h" - export std_private_algorithm_in_out_result -} -module std_private_algorithm_ranges_next_permutation [system] { - header "__algorithm/ranges_next_permutation.h" - export std_private_algorithm_in_found_result - export std_private_functional_ranges_operations -} -module std_private_algorithm_ranges_none_of [system] { header "__algorithm/ranges_none_of.h" } -module std_private_algorithm_ranges_nth_element [system] { - header "__algorithm/ranges_nth_element.h" - export std_private_functional_ranges_operations -} -module std_private_algorithm_ranges_partial_sort [system] { - header "__algorithm/ranges_partial_sort.h" - export std_private_functional_ranges_operations -} -module std_private_algorithm_ranges_partial_sort_copy [system] { - header "__algorithm/ranges_partial_sort_copy.h" - export 
std_private_algorithm_in_out_result - export std_private_functional_ranges_operations -} -module std_private_algorithm_ranges_partition [system] { header "__algorithm/ranges_partition.h" } -module std_private_algorithm_ranges_partition_copy [system] { header "__algorithm/ranges_partition_copy.h" } -module std_private_algorithm_ranges_partition_point [system] { header "__algorithm/ranges_partition_point.h" } -module std_private_algorithm_ranges_pop_heap [system] { - header "__algorithm/ranges_pop_heap.h" - export std_private_functional_ranges_operations -} -module std_private_algorithm_ranges_prev_permutation [system] { - header "__algorithm/ranges_prev_permutation.h" - export std_private_algorithm_in_found_result - export std_private_functional_ranges_operations -} -module std_private_algorithm_ranges_push_heap [system] { - header "__algorithm/ranges_push_heap.h" - export std_private_functional_ranges_operations -} -module std_private_algorithm_ranges_remove [system] { header "__algorithm/ranges_remove.h" } -module std_private_algorithm_ranges_remove_copy [system] { - header "__algorithm/ranges_remove_copy.h" - export std_private_algorithm_in_out_result -} -module std_private_algorithm_ranges_remove_copy_if [system] { - header "__algorithm/ranges_remove_copy_if.h" - export std_private_algorithm_in_out_result -} -module std_private_algorithm_ranges_remove_if [system] { header "__algorithm/ranges_remove_if.h" } -module std_private_algorithm_ranges_replace [system] { header "__algorithm/ranges_replace.h" } -module std_private_algorithm_ranges_replace_copy [system] { - header "__algorithm/ranges_replace_copy.h" - export std_private_algorithm_in_out_result -} -module std_private_algorithm_ranges_replace_copy_if [system] { - header "__algorithm/ranges_replace_copy_if.h" - export std_private_algorithm_in_out_result -} -module std_private_algorithm_ranges_replace_if [system] { header "__algorithm/ranges_replace_if.h" } -module std_private_algorithm_ranges_reverse [system] 
{ header "__algorithm/ranges_reverse.h" } -module std_private_algorithm_ranges_reverse_copy [system] { - header "__algorithm/ranges_reverse_copy.h" - export std_private_algorithm_in_out_result -} -module std_private_algorithm_ranges_rotate [system] { header "__algorithm/ranges_rotate.h" } -module std_private_algorithm_ranges_rotate_copy [system] { - header "__algorithm/ranges_rotate_copy.h" - export std_private_algorithm_in_out_result -} -module std_private_algorithm_ranges_sample [system] { header "__algorithm/ranges_sample.h" } -module std_private_algorithm_ranges_search [system] { header "__algorithm/ranges_search.h" } -module std_private_algorithm_ranges_search_n [system] { header "__algorithm/ranges_search_n.h" } -module std_private_algorithm_ranges_set_difference [system] { - header "__algorithm/ranges_set_difference.h" - export std_private_algorithm_in_out_result -} -module std_private_algorithm_ranges_set_intersection [system] { - header "__algorithm/ranges_set_intersection.h" - export std_private_algorithm_in_in_out_result -} -module std_private_algorithm_ranges_set_symmetric_difference [system] { - header "__algorithm/ranges_set_symmetric_difference.h" - export std_private_algorithm_in_in_out_result - export std_private_functional_ranges_operations -} -module std_private_algorithm_ranges_set_union [system] { - header "__algorithm/ranges_set_union.h" - export std_private_algorithm_in_in_out_result - export std_private_functional_ranges_operations -} -module std_private_algorithm_ranges_shuffle [system] { header "__algorithm/ranges_shuffle.h" } -module std_private_algorithm_ranges_sort [system] { - header "__algorithm/ranges_sort.h" - export std_private_algorithm_make_projected - export std_private_functional_ranges_operations -} -module std_private_algorithm_ranges_sort_heap [system] { - header "__algorithm/ranges_sort_heap.h" - export std_private_functional_ranges_operations -} -module std_private_algorithm_ranges_stable_partition [system] { header 
"__algorithm/ranges_stable_partition.h" } -module std_private_algorithm_ranges_stable_sort [system] { - header "__algorithm/ranges_stable_sort.h" - export std_private_functional_ranges_operations -} -module std_private_algorithm_ranges_starts_with [system] { header "__algorithm/ranges_starts_with.h" } -module std_private_algorithm_ranges_swap_ranges [system] { - header "__algorithm/ranges_swap_ranges.h" - export std_private_algorithm_in_in_result -} -module std_private_algorithm_ranges_transform [system] { - header "__algorithm/ranges_transform.h" - export std_private_algorithm_in_in_out_result - export std_private_algorithm_in_out_result -} -module std_private_algorithm_ranges_unique [system] { header "__algorithm/ranges_unique.h" } -module std_private_algorithm_ranges_unique_copy [system] { - header "__algorithm/ranges_unique_copy.h" - export std_private_algorithm_in_out_result -} -module std_private_algorithm_ranges_upper_bound [system] { - header "__algorithm/ranges_upper_bound.h" - export std_private_functional_ranges_operations -} -module std_private_algorithm_remove [system] { header "__algorithm/remove.h" } -module std_private_algorithm_remove_copy [system] { header "__algorithm/remove_copy.h" } -module std_private_algorithm_remove_copy_if [system] { header "__algorithm/remove_copy_if.h" } -module std_private_algorithm_remove_if [system] { header "__algorithm/remove_if.h" } -module std_private_algorithm_replace [system] { header "__algorithm/replace.h" } -module std_private_algorithm_replace_copy [system] { header "__algorithm/replace_copy.h" } -module std_private_algorithm_replace_copy_if [system] { header "__algorithm/replace_copy_if.h" } -module std_private_algorithm_replace_if [system] { header "__algorithm/replace_if.h" } -module std_private_algorithm_reverse [system] { header "__algorithm/reverse.h" } -module std_private_algorithm_reverse_copy [system] { header "__algorithm/reverse_copy.h" } -module std_private_algorithm_rotate [system] { header 
"__algorithm/rotate.h" } -module std_private_algorithm_rotate_copy [system] { header "__algorithm/rotate_copy.h" } -module std_private_algorithm_sample [system] { header "__algorithm/sample.h" } -module std_private_algorithm_search [system] { header "__algorithm/search.h" } -module std_private_algorithm_search_n [system] { header "__algorithm/search_n.h" } -module std_private_algorithm_set_difference [system] { header "__algorithm/set_difference.h" } -module std_private_algorithm_set_intersection [system] { header "__algorithm/set_intersection.h" } -module std_private_algorithm_set_symmetric_difference [system] { header "__algorithm/set_symmetric_difference.h" } -module std_private_algorithm_set_union [system] { header "__algorithm/set_union.h" } -module std_private_algorithm_shift_left [system] { header "__algorithm/shift_left.h" } -module std_private_algorithm_shift_right [system] { header "__algorithm/shift_right.h" } -module std_private_algorithm_shuffle [system] { header "__algorithm/shuffle.h" } -module std_private_algorithm_sift_down [system] { header "__algorithm/sift_down.h" } -module std_private_algorithm_sort [system] { - header "__algorithm/sort.h" - export std_private_debug_utils_strict_weak_ordering_check -} -module std_private_algorithm_simd_utils [system] { header "__algorithm/simd_utils.h" } -module std_private_algorithm_sort_heap [system] { header "__algorithm/sort_heap.h" } -module std_private_algorithm_stable_partition [system] { header "__algorithm/stable_partition.h" } -module std_private_algorithm_stable_sort [system] { header "__algorithm/stable_sort.h" } -module std_private_algorithm_swap_ranges [system] { - header "__algorithm/swap_ranges.h" - export std_private_algorithm_iterator_operations -} -module std_private_algorithm_three_way_comp_ref_type [system] { header "__algorithm/three_way_comp_ref_type.h" } -module std_private_algorithm_transform [system] { header "__algorithm/transform.h" } -module 
std_private_algorithm_uniform_random_bit_generator_adaptor [system] { header "__algorithm/uniform_random_bit_generator_adaptor.h" } -module std_private_algorithm_unique [system] { header "__algorithm/unique.h" } -module std_private_algorithm_unique_copy [system] { header "__algorithm/unique_copy.h" } -module std_private_algorithm_unwrap_iter [system] { - header "__algorithm/unwrap_iter.h" - export std_private_iterator_iterator_traits -} -module std_private_algorithm_unwrap_range [system] { - header "__algorithm/unwrap_range.h" - export std_private_utility_pair -} -module std_private_algorithm_upper_bound [system] { header "__algorithm/upper_bound.h" } + module variant { + module fwd { header "__fwd/variant.h" } + module monostate { header "__variant/monostate.h" } -module std_private_array_array_fwd [system] { header "__fwd/array.h" } + header "variant" + export * + } -module std_private_atomic_aliases [system] { - header "__atomic/aliases.h" - export std_private_atomic_atomic -} -module std_private_atomic_atomic [system] { - header "__atomic/atomic.h" - export std_private_atomic_atomic_base -} -module std_private_atomic_atomic_base [system] { header "__atomic/atomic_base.h" } -module std_private_atomic_atomic_flag [system] { - header "__atomic/atomic_flag.h" - export * -} -module std_private_atomic_atomic_init [system] { header "__atomic/atomic_init.h" } -module std_private_atomic_atomic_lock_free [system] { header "__atomic/atomic_lock_free.h" } -module std_private_atomic_atomic_ref [system] { header "__atomic/atomic_ref.h" } -module std_private_atomic_atomic_sync [system] { - header "__atomic/atomic_sync.h" - export std_private_atomic_to_gcc_order -} -module std_private_atomic_check_memory_order [system] { header "__atomic/check_memory_order.h" } -module std_private_atomic_contention_t [system] { header "__atomic/contention_t.h" } -module std_private_atomic_cxx_atomic_impl [system] { header "__atomic/cxx_atomic_impl.h" } -module std_private_atomic_fence [system] 
{ header "__atomic/fence.h" } -module std_private_atomic_is_always_lock_free [system] { header "__atomic/is_always_lock_free.h" } -module std_private_atomic_kill_dependency [system] { header "__atomic/kill_dependency.h" } -module std_private_atomic_memory_order [system] { header "__atomic/memory_order.h" } -module std_private_atomic_to_gcc_order [system] { - header "__atomic/to_gcc_order.h" - export std_private_atomic_memory_order -} + module vector { + module fwd { header "__fwd/vector.h" } -module std_private_bit_bit_cast [system] { header "__bit/bit_cast.h" } -module std_private_bit_bit_ceil [system] { header "__bit/bit_ceil.h" } -module std_private_bit_bit_floor [system] { header "__bit/bit_floor.h" } -module std_private_bit_bit_log2 [system] { header "__bit/bit_log2.h" } -module std_private_bit_bit_width [system] { header "__bit/bit_width.h" } -module std_private_bit_blsr [system] { header "__bit/blsr.h" } -module std_private_bit_byteswap [system] { header "__bit/byteswap.h" } -module std_private_bit_countl [system] { header "__bit/countl.h" } -module std_private_bit_countr [system] { header "__bit/countr.h" } -module std_private_bit_endian [system] { header "__bit/endian.h" } -module std_private_bit_has_single_bit [system] { header "__bit/has_single_bit.h" } -module std_private_bit_invert_if [system] { header "__bit/invert_if.h" } -module std_private_bit_popcount [system] { header "__bit/popcount.h" } -module std_private_bit_rotate [system] { header "__bit/rotate.h" } - -module std_private_chrono_calendar [system] { header "__chrono/calendar.h" } -module std_private_chrono_concepts [system] { header "__chrono/concepts.h" } -module std_private_chrono_convert_to_timespec [system] { header "__chrono/convert_to_timespec.h" } -module std_private_chrono_convert_to_tm [system] { header "__chrono/convert_to_tm.h" } -module std_private_chrono_day [system] { header "__chrono/day.h" } -module std_private_chrono_duration [system] { - header "__chrono/duration.h" - export 
std_private_type_traits_is_convertible -} -module std_private_chrono_exception [system] { header "__chrono/exception.h" } -module std_private_chrono_file_clock [system] { header "__chrono/file_clock.h" } -module std_private_chrono_formatter [system] { - header "__chrono/formatter.h" -} -module std_private_chrono_hh_mm_ss [system] { header "__chrono/hh_mm_ss.h" } -module std_private_chrono_high_resolution_clock [system] { - header "__chrono/high_resolution_clock.h" - export std_private_chrono_steady_clock - export std_private_chrono_system_clock -} -module std_private_chrono_leap_second [system] { header "__chrono/leap_second.h" } -module std_private_chrono_literals [system] { header "__chrono/literals.h" } -module std_private_chrono_local_info [system] { - header "__chrono/local_info.h" - export std_private_chrono_sys_info -} -module std_private_chrono_month [system] { header "__chrono/month.h" } -module std_private_chrono_month_weekday [system] { header "__chrono/month_weekday.h" } -module std_private_chrono_monthday [system] { header "__chrono/monthday.h" } -module std_private_chrono_ostream [system] { - header "__chrono/ostream.h" -} -module std_private_chrono_parser_std_format_spec [system] { - header "__chrono/parser_std_format_spec.h" -} -module std_private_chrono_statically_widen [system] { header "__chrono/statically_widen.h" } -module std_private_chrono_steady_clock [system] { - header "__chrono/steady_clock.h" - export std_private_chrono_time_point -} -module std_private_chrono_time_zone [system] { - header "__chrono/time_zone.h" - export std_private_memory_unique_ptr -} -module std_private_chrono_time_zone_link [system] { - header "__chrono/time_zone_link.h" -} -module std_private_chrono_sys_info [system] { - header "__chrono/sys_info.h" -} -module std_private_chrono_system_clock [system] { - header "__chrono/system_clock.h" - export std_private_chrono_time_point -} -module std_private_chrono_tzdb [system] { - header "__chrono/tzdb.h" - export * -} 
-module std_private_chrono_tzdb_list [system] { - header "__chrono/tzdb_list.h" - export * -} -module std_private_chrono_time_point [system] { header "__chrono/time_point.h" } -module std_private_chrono_weekday [system] { header "__chrono/weekday.h" } -module std_private_chrono_year [system] { header "__chrono/year.h" } -module std_private_chrono_year_month [system] { header "__chrono/year_month.h" } -module std_private_chrono_year_month_day [system] { header "__chrono/year_month_day.h" } -module std_private_chrono_year_month_weekday [system] { header "__chrono/year_month_weekday.h" } -module std_private_chrono_zoned_time [system] { header "__chrono/zoned_time.h" } - -module std_private_compare_common_comparison_category [system] { header "__compare/common_comparison_category.h" } -module std_private_compare_compare_partial_order_fallback [system] { header "__compare/compare_partial_order_fallback.h" } -module std_private_compare_compare_strong_order_fallback [system] { header "__compare/compare_strong_order_fallback.h" } -module std_private_compare_compare_three_way [system] { header "__compare/compare_three_way.h" } -module std_private_compare_compare_three_way_result [system] { header "__compare/compare_three_way_result.h" } -module std_private_compare_compare_weak_order_fallback [system] { header "__compare/compare_weak_order_fallback.h" } -module std_private_compare_is_eq [system] { header "__compare/is_eq.h" } -module std_private_compare_ordering [system] { header "__compare/ordering.h" } -module std_private_compare_partial_order [system] { header "__compare/partial_order.h" } -module std_private_compare_strong_order [system] { header "__compare/strong_order.h" } -module std_private_compare_synth_three_way [system] { header "__compare/synth_three_way.h" } -module std_private_compare_three_way_comparable [system] { header "__compare/three_way_comparable.h" } -module std_private_compare_weak_order [system] { header "__compare/weak_order.h" } - -module 
std_private_complex_complex_fwd [system] { header "__fwd/complex.h" } - -module std_private_concepts_arithmetic [system] { header "__concepts/arithmetic.h" } -module std_private_concepts_assignable [system] { header "__concepts/assignable.h" } -module std_private_concepts_boolean_testable [system] { header "__concepts/boolean_testable.h" } -module std_private_concepts_class_or_enum [system] { header "__concepts/class_or_enum.h" } -module std_private_concepts_common_reference_with [system] { header "__concepts/common_reference_with.h" } -module std_private_concepts_common_with [system] { header "__concepts/common_with.h" } -module std_private_concepts_constructible [system] { - header "__concepts/constructible.h" - export std_private_concepts_destructible -} -module std_private_concepts_convertible_to [system] { header "__concepts/convertible_to.h" } -module std_private_concepts_copyable [system] { header "__concepts/copyable.h" } -module std_private_concepts_derived_from [system] { header "__concepts/derived_from.h" } -module std_private_concepts_destructible [system] { - header "__concepts/destructible.h" - export std_private_type_traits_is_nothrow_destructible -} -module std_private_concepts_different_from [system] { header "__concepts/different_from.h" } -module std_private_concepts_equality_comparable [system] { - header "__concepts/equality_comparable.h" - export std_private_type_traits_common_reference -} -module std_private_concepts_invocable [system] { header "__concepts/invocable.h" } -module std_private_concepts_movable [system] { - header "__concepts/movable.h" - export std_private_type_traits_is_object -} -module std_private_concepts_predicate [system] { header "__concepts/predicate.h" } -module std_private_concepts_regular [system] { header "__concepts/regular.h" } -module std_private_concepts_relation [system] { header "__concepts/relation.h" } -module std_private_concepts_same_as [system] { - header "__concepts/same_as.h" - export 
std_private_type_traits_is_same -} -module std_private_concepts_semiregular [system] { header "__concepts/semiregular.h" } -module std_private_concepts_swappable [system] { header "__concepts/swappable.h" } -module std_private_concepts_totally_ordered [system] { header "__concepts/totally_ordered.h" } - -module std_private_debug_utils_randomize_range [system] { header "__debug_utils/randomize_range.h" } -module std_private_debug_utils_sanitizers [system] { header "__debug_utils/sanitizers.h" } -module std_private_debug_utils_strict_weak_ordering_check [system] { - header "__debug_utils/strict_weak_ordering_check.h" - export std_private_type_traits_is_constant_evaluated -} + header "vector" + export * + } -module std_private_deque_fwd [system] { header "__fwd/deque.h" } + // Experimental C++ Standard Library interfaces + module experimental { + module iterator { header "experimental/iterator" } + module memory { header "experimental/memory" } + module propagate_const { header "experimental/propagate_const" } + module type_traits { header "experimental/type_traits" } + module utility { header "experimental/utility" } + module simd { + private header "experimental/__simd/aligned_tag.h" + private header "experimental/__simd/declaration.h" + private header "experimental/__simd/reference.h" + private header "experimental/__simd/scalar.h" + private header "experimental/__simd/simd_mask.h" + private header "experimental/__simd/simd.h" + private header "experimental/__simd/traits.h" + private header "experimental/__simd/utility.h" + private header "experimental/__simd/vec_ext.h" + header "experimental/simd" + export * + } + } -module std_private_exception_exception [system] { header "__exception/exception.h" } -module std_private_exception_exception_ptr [system] { - header "__exception/exception_ptr.h" - export std_private_exception_operations -} -module std_private_exception_nested_exception [system] { header "__exception/nested_exception.h" } -module 
std_private_exception_operations [system] { header "__exception/operations.h" } -module std_private_exception_terminate [system] { header "__exception/terminate.h" } - -module std_private_expected_bad_expected_access [system] { header "__expected/bad_expected_access.h" } -module std_private_expected_expected [system] { header "__expected/expected.h" } -module std_private_expected_unexpect [system] { header "__expected/unexpect.h" } -module std_private_expected_unexpected [system] { header "__expected/unexpected.h" } - -module std_private_format_buffer [system] { header "__format/buffer.h" } -module std_private_format_concepts [system] { header "__format/concepts.h" } -module std_private_format_container_adaptor [system] { header "__format/container_adaptor.h" } -module std_private_format_enable_insertable [system] { header "__format/enable_insertable.h" } -module std_private_format_escaped_output_table [system] { header "__format/escaped_output_table.h" } -module std_private_format_extended_grapheme_cluster_table [system] { header "__format/extended_grapheme_cluster_table.h" } -module std_private_format_format_arg [system] { header "__format/format_arg.h" } -module std_private_format_format_arg_store [system] { header "__format/format_arg_store.h" } -module std_private_format_format_args [system] { header "__format/format_args.h" } -module std_private_format_format_context [system] { - header "__format/format_context.h" - export * -} -module std_private_format_format_error [system] { header "__format/format_error.h" } -module std_private_format_format_functions [system] { - header "__format/format_functions.h" - export std_string -} -module std_private_format_fwd [system] { header "__fwd/format.h" } -module std_private_format_format_parse_context [system] { header "__format/format_parse_context.h" } -module std_private_format_format_string [system] { header "__format/format_string.h" } -module std_private_format_format_to_n_result [system] { - header 
"__format/format_to_n_result.h" - export std_private_iterator_incrementable_traits -} -module std_private_format_formatter [system] { header "__format/formatter.h" } -module std_private_format_formatter_bool [system] { header "__format/formatter_bool.h" } -module std_private_format_formatter_char [system] { header "__format/formatter_char.h" } -module std_private_format_formatter_floating_point [system] { header "__format/formatter_floating_point.h" } -module std_private_format_formatter_integer [system] { header "__format/formatter_integer.h" } -module std_private_format_formatter_integral [system] { header "__format/formatter_integral.h" } -module std_private_format_formatter_output [system] { header "__format/formatter_output.h" } -module std_private_format_formatter_pointer [system] { header "__format/formatter_pointer.h" } -module std_private_format_formatter_string [system] { header "__format/formatter_string.h" } -module std_private_format_formatter_tuple [system] { header "__format/formatter_tuple.h" } -module std_private_format_indic_conjunct_break_table [system] { header "__format/indic_conjunct_break_table.h" } -module std_private_format_parser_std_format_spec [system] { header "__format/parser_std_format_spec.h" } -module std_private_format_range_default_formatter [system] { header "__format/range_default_formatter.h" } -module std_private_format_range_formatter [system] { header "__format/range_formatter.h" } -module std_private_format_unicode [system] { - header "__format/unicode.h" - export std_private_format_extended_grapheme_cluster_table - export std_private_format_indic_conjunct_break_table -} -module std_private_format_width_estimation_table [system] { header "__format/width_estimation_table.h" } -module std_private_format_write_escaped [system] { header "__format/write_escaped.h" } - -module std_private_functional_binary_function [system] { header "__functional/binary_function.h" } -module std_private_functional_binary_negate [system] { header 
"__functional/binary_negate.h" } -module std_private_functional_bind [system] { header "__functional/bind.h" } -module std_private_functional_bind_back [system] { header "__functional/bind_back.h" } -module std_private_functional_bind_front [system] { header "__functional/bind_front.h" } -module std_private_functional_binder1st [system] { header "__functional/binder1st.h" } -module std_private_functional_binder2nd [system] { header "__functional/binder2nd.h" } -module std_private_functional_boyer_moore_searcher [system] { - header "__functional/boyer_moore_searcher.h" - export std_private_memory_shared_ptr -} -module std_private_functional_compose [system] { - header "__functional/compose.h" - export std_private_functional_perfect_forward -} -module std_private_functional_default_searcher [system] { header "__functional/default_searcher.h" } -module std_private_functional_function [system] { header "__functional/function.h" } -module std_private_functional_hash [system] { - header "__functional/hash.h" - export std_cstdint - export std_private_type_traits_underlying_type - export std_private_utility_pair -} -module std_private_functional_fwd [system] { header "__fwd/functional.h" } -module std_private_functional_identity [system] { header "__functional/identity.h" } -module std_private_functional_invoke [system] { - header "__functional/invoke.h" - export * -} -module std_private_functional_is_transparent [system] { header "__functional/is_transparent.h" } -module std_private_functional_mem_fn [system] { header "__functional/mem_fn.h" } -module std_private_functional_mem_fun_ref [system] { header "__functional/mem_fun_ref.h" } -module std_private_functional_not_fn [system] { - header "__functional/not_fn.h" - export std_private_functional_perfect_forward -} -module std_private_functional_operations [system] { header "__functional/operations.h" } -module std_private_functional_perfect_forward [system] { - header "__functional/perfect_forward.h" - export * -} -module 
std_private_functional_pointer_to_binary_function [system] { header "__functional/pointer_to_binary_function.h" } -module std_private_functional_pointer_to_unary_function [system] { header "__functional/pointer_to_unary_function.h" } -module std_private_functional_ranges_operations [system] { header "__functional/ranges_operations.h" } -module std_private_functional_reference_wrapper [system] { header "__functional/reference_wrapper.h" } -module std_private_functional_unary_function [system] { header "__functional/unary_function.h" } -module std_private_functional_unary_negate [system] { header "__functional/unary_negate.h" } -module std_private_functional_weak_result_type [system] { header "__functional/weak_result_type.h" } - -module std_private_ios_fpos [system] { header "__ios/fpos.h" } - -module std_private_iosfwd_fstream_fwd [system] { header "__fwd/fstream.h" } -module std_private_iosfwd_ios_fwd [system] { header "__fwd/ios.h" } -module std_private_iosfwd_istream_fwd [system] { header "__fwd/istream.h" } -module std_private_iosfwd_ostream_fwd [system] { header "__fwd/ostream.h" } -module std_private_iosfwd_sstream_fwd [system] { header "__fwd/sstream.h" } -module std_private_iosfwd_streambuf_fwd [system] { header "__fwd/streambuf.h" } - -module std_private_iterator_access [system] { header "__iterator/access.h" } -module std_private_iterator_advance [system] { header "__iterator/advance.h" } -module std_private_iterator_aliasing_iterator [system] { header "__iterator/aliasing_iterator.h" } -module std_private_iterator_back_insert_iterator [system] { header "__iterator/back_insert_iterator.h" } -module std_private_iterator_bounded_iter [system] { header "__iterator/bounded_iter.h" } -module std_private_iterator_common_iterator [system] { header "__iterator/common_iterator.h" } -module std_private_iterator_concepts [system] { - header "__iterator/concepts.h" - export std_private_concepts_constructible - export std_private_concepts_equality_comparable - export 
std_private_concepts_movable - export std_private_type_traits_common_reference - export std_private_type_traits_is_reference - export std_private_type_traits_remove_cvref -} -module std_private_iterator_counted_iterator [system] { header "__iterator/counted_iterator.h" } -module std_private_iterator_cpp17_iterator_concepts [system] { header "__iterator/cpp17_iterator_concepts.h" } -module std_private_iterator_data [system] { header "__iterator/data.h" } -module std_private_iterator_default_sentinel [system] { header "__iterator/default_sentinel.h" } -module std_private_iterator_distance [system] { - header "__iterator/distance.h" - export std_private_ranges_size -} -module std_private_iterator_empty [system] { header "__iterator/empty.h" } -module std_private_iterator_erase_if_container [system] { header "__iterator/erase_if_container.h" } -module std_private_iterator_front_insert_iterator [system] { header "__iterator/front_insert_iterator.h" } -module std_private_iterator_incrementable_traits [system] { header "__iterator/incrementable_traits.h" } -module std_private_iterator_indirectly_comparable [system] { header "__iterator/indirectly_comparable.h" } -module std_private_iterator_insert_iterator [system] { header "__iterator/insert_iterator.h" } -module std_private_iterator_istream_iterator [system] { header "__iterator/istream_iterator.h" } -module std_private_iterator_istreambuf_iterator [system] { header "__iterator/istreambuf_iterator.h" } -module std_private_iterator_iter_move [system] { header "__iterator/iter_move.h" } -module std_private_iterator_iter_swap [system] { header "__iterator/iter_swap.h" } -module std_private_iterator_iterator [system] { header "__iterator/iterator.h" } -module std_private_iterator_iterator_traits [system] { - header "__iterator/iterator_traits.h" - export std_private_type_traits_is_primary_template - export std_private_type_traits_integral_constant -} -module std_private_iterator_iterator_with_data [system] { header 
"__iterator/iterator_with_data.h" } -module std_private_iterator_mergeable [system] { - header "__iterator/mergeable.h" - export std_private_functional_ranges_operations -} -module std_private_iterator_move_iterator [system] { header "__iterator/move_iterator.h" } -module std_private_iterator_move_sentinel [system] { header "__iterator/move_sentinel.h" } -module std_private_iterator_next [system] { header "__iterator/next.h" } -module std_private_iterator_ostream_iterator [system] { header "__iterator/ostream_iterator.h" } -module std_private_iterator_ostreambuf_iterator [system] { - header "__iterator/ostreambuf_iterator.h" - export * -} -module std_private_iterator_permutable [system] { header "__iterator/permutable.h" } -module std_private_iterator_prev [system] { header "__iterator/prev.h" } -module std_private_iterator_projected [system] { header "__iterator/projected.h" } -module std_private_iterator_ranges_iterator_traits [system] { header "__iterator/ranges_iterator_traits.h" } -module std_private_iterator_readable_traits [system] { header "__iterator/readable_traits.h" } -module std_private_iterator_reverse_access [system] { header "__iterator/reverse_access.h" } -module std_private_iterator_reverse_iterator [system] { header "__iterator/reverse_iterator.h" } -module std_private_iterator_segmented_iterator [system] { header "__iterator/segmented_iterator.h" } -module std_private_iterator_size [system] { header "__iterator/size.h" } -module std_private_iterator_sortable [system] { - header "__iterator/sortable.h" - export std_private_functional_ranges_operations -} -module std_private_iterator_unreachable_sentinel [system] { header "__iterator/unreachable_sentinel.h" } -module std_private_iterator_wrap_iter [system] { header "__iterator/wrap_iter.h" } - -module std_private_locale_locale_base_api_android [system] { textual header "__locale_dir/locale_base_api/android.h" } -module std_private_locale_locale_base_api_bsd_locale_defaults [system] { textual 
header "__locale_dir/locale_base_api/bsd_locale_defaults.h" } -module std_private_locale_locale_base_api_bsd_locale_fallbacks [system] { textual header "__locale_dir/locale_base_api/bsd_locale_fallbacks.h" } -module std_private_locale_locale_base_api_fuchsia [system] { textual header "__locale_dir/locale_base_api/fuchsia.h" } -module std_private_locale_locale_base_api_ibm [system] { textual header "__locale_dir/locale_base_api/ibm.h" } -module std_private_locale_locale_base_api_locale_guard [system] { header "__locale_dir/locale_base_api/locale_guard.h" } -module std_private_locale_locale_base_api_musl [system] { textual header "__locale_dir/locale_base_api/musl.h" } -module std_private_locale_locale_base_api_newlib [system] { textual header "__locale_dir/locale_base_api/newlib.h" } -module std_private_locale_locale_base_api_openbsd [system] { textual header "__locale_dir/locale_base_api/openbsd.h" } -module std_private_locale_locale_base_api_win32 [system] { textual header "__locale_dir/locale_base_api/win32.h" } -module std_private_locale_locale_base_api [system] { - header "__locale_dir/locale_base_api.h" - export * -} + // Implementation detail headers that are private to libc++. These modules + // must not be directly imported. 
+ module debug_utils { + module randomize_range { header "__debug_utils/randomize_range.h" } + module sanitizers { header "__debug_utils/sanitizers.h" } + module strict_weak_ordering_check { header "__debug_utils/strict_weak_ordering_check.h" } + } -module std_private_math_abs [system] { header "__math/abs.h" } -module std_private_math_copysign [system] { header "__math/copysign.h" } -module std_private_math_error_functions [system] { header "__math/error_functions.h" } -module std_private_math_exponential_functions [system] { header "__math/exponential_functions.h" } -module std_private_math_fdim [system] { header "__math/fdim.h" } -module std_private_math_fma [system] { header "__math/fma.h" } -module std_private_math_gamma [system] { header "__math/gamma.h" } -module std_private_math_hyperbolic_functions [system] { header "__math/hyperbolic_functions.h" } -module std_private_math_hypot [system] { header "__math/hypot.h" } -module std_private_math_inverse_hyperbolic_functions [system] { header "__math/inverse_hyperbolic_functions.h" } -module std_private_math_inverse_trigonometric_functions [system] { header "__math/inverse_trigonometric_functions.h" } -module std_private_math_logarithms [system] { header "__math/logarithms.h" } -module std_private_math_min_max [system] { header "__math/min_max.h" } -module std_private_math_modulo [system] { header "__math/modulo.h" } -module std_private_math_remainder [system] { header "__math/remainder.h" } -module std_private_math_roots [system] { header "__math/roots.h" } -module std_private_math_rounding_functions [system] { header "__math/rounding_functions.h" } -module std_private_math_special_functions [system] { header "__math/special_functions.h" } -module std_private_math_traits [system] { header "__math/traits.h" } -module std_private_math_trigonometric_functions [system] { header "__math/trigonometric_functions.h" } - -module std_private_memory_addressof [system] { header "__memory/addressof.h" } -module 
std_private_memory_align [system] { header "__memory/align.h" } -module std_private_memory_aligned_alloc [system] { header "__memory/aligned_alloc.h" } -module std_private_memory_allocate_at_least [system] { header "__memory/allocate_at_least.h" } -module std_private_memory_allocation_guard [system] { header "__memory/allocation_guard.h" } -module std_private_memory_allocator [system] { header "__memory/allocator.h" } -module std_private_memory_allocator_arg_t [system] { header "__memory/allocator_arg_t.h" } -module std_private_memory_allocator_destructor [system] { header "__memory/allocator_destructor.h" } -module std_private_memory_allocator_traits [system] { header "__memory/allocator_traits.h" } -module std_private_memory_array_cookie [system] { header "__memory/array_cookie.h" } -module std_private_memory_assume_aligned [system] { header "__memory/assume_aligned.h" } -module std_private_memory_auto_ptr [system] { header "__memory/auto_ptr.h" } -module std_private_memory_builtin_new_allocator [system] { - header "__memory/builtin_new_allocator.h" - export * -} -module std_private_memory_compressed_pair [system] { header "__memory/compressed_pair.h" } -module std_private_memory_concepts [system] { - header "__memory/concepts.h" - export std_private_type_traits_remove_reference -} -module std_private_memory_construct_at [system] { header "__memory/construct_at.h" } -module std_private_memory_destruct_n [system] { header "__memory/destruct_n.h" } -module std_private_memory_fwd [system] { header "__fwd/memory.h" } -module std_private_memory_inout_ptr [system] { header "__memory/inout_ptr.h" } -module std_private_memory_noexcept_move_assign_container [system] { header "__memory/noexcept_move_assign_container.h" } -module std_private_memory_out_ptr [system] { header "__memory/out_ptr.h" } -module std_private_memory_pointer_traits [system] { header "__memory/pointer_traits.h" } -module std_private_memory_ranges_construct_at [system] { header 
"__memory/ranges_construct_at.h" } -module std_private_memory_ranges_uninitialized_algorithms [system] { - header "__memory/ranges_uninitialized_algorithms.h" - export std_private_algorithm_in_out_result -} -module std_private_memory_raw_storage_iterator [system] { header "__memory/raw_storage_iterator.h" } -module std_private_memory_shared_ptr [system] { - header "__memory/shared_ptr.h" - export std_private_memory_uninitialized_algorithms -} -module std_private_memory_swap_allocator [system] { header "__memory/swap_allocator.h" } -module std_private_memory_temp_value [system] { header "__memory/temp_value.h" } -module std_private_memory_temporary_buffer [system] { - header "__memory/temporary_buffer.h" - export std_private_utility_pair -} -module std_private_memory_uninitialized_algorithms [system] { - header "__memory/uninitialized_algorithms.h" - export std_private_algorithm_copy -} -module std_private_memory_unique_ptr [system] { - header "__memory/unique_ptr.h" - export std_private_type_traits_add_lvalue_reference - export std_private_type_traits_is_pointer - export std_private_type_traits_type_identity -} -module std_private_memory_unique_temporary_buffer [system] { - header "__memory/unique_temporary_buffer.h" - export std_private_memory_unique_ptr - export std_private_type_traits_is_constant_evaluated -} -module std_private_memory_uses_allocator [system] { header "__memory/uses_allocator.h" } -module std_private_memory_uses_allocator_construction [system] { header "__memory/uses_allocator_construction.h" } - -module std_private_memory_resource_memory_resource [system] { header "__memory_resource/memory_resource.h" } -module std_private_memory_resource_memory_resource_fwd [system] { header "__fwd/memory_resource.h" } -module std_private_memory_resource_monotonic_buffer_resource [system] { header "__memory_resource/monotonic_buffer_resource.h" } -module std_private_memory_resource_polymorphic_allocator [system] { header 
"__memory_resource/polymorphic_allocator.h" } -module std_private_memory_resource_pool_options [system] { header "__memory_resource/pool_options.h" } -module std_private_memory_resource_synchronized_pool_resource [system] { - header "__memory_resource/synchronized_pool_resource.h" - export * -} -module std_private_memory_resource_unsynchronized_pool_resource [system] { header "__memory_resource/unsynchronized_pool_resource.h" } - -module std_private_mutex_lock_guard [system] { header "__mutex/lock_guard.h" } -module std_private_mutex_mutex [system] { header "__mutex/mutex.h" } -module std_private_mutex_once_flag [system] { header "__mutex/once_flag.h" } -module std_private_mutex_tag_types [system] { header "__mutex/tag_types.h" } -module std_private_mutex_unique_lock [system] { header "__mutex/unique_lock.h" } - -module std_private_numeric_accumulate [system] { header "__numeric/accumulate.h" } -module std_private_numeric_adjacent_difference [system] { header "__numeric/adjacent_difference.h" } -module std_private_numeric_exclusive_scan [system] { header "__numeric/exclusive_scan.h" } -module std_private_numeric_gcd_lcm [system] { header "__numeric/gcd_lcm.h" } -module std_private_numeric_inclusive_scan [system] { header "__numeric/inclusive_scan.h" } -module std_private_numeric_inner_product [system] { header "__numeric/inner_product.h" } -module std_private_numeric_iota [system] { header "__numeric/iota.h" } -module std_private_numeric_midpoint [system] { header "__numeric/midpoint.h" } -module std_private_numeric_partial_sum [system] { header "__numeric/partial_sum.h" } -module std_private_numeric_pstl [system] { - header "__numeric/pstl.h" - export * -} -module std_private_numeric_reduce [system] { header "__numeric/reduce.h" } -module std_private_numeric_saturation_arithmetic [system] { header "__numeric/saturation_arithmetic.h" } -module std_private_numeric_transform_exclusive_scan [system] { header "__numeric/transform_exclusive_scan.h" } -module 
std_private_numeric_transform_inclusive_scan [system] { header "__numeric/transform_inclusive_scan.h" } -module std_private_numeric_transform_reduce [system] { header "__numeric/transform_reduce.h" } - -module std_private_pstl [system] { - header "__pstl/backend.h" - header "__pstl/backend_fwd.h" - header "__pstl/backends/default.h" - header "__pstl/backends/libdispatch.h" - header "__pstl/backends/serial.h" - header "__pstl/backends/std_thread.h" - header "__pstl/cpu_algos/any_of.h" - header "__pstl/cpu_algos/cpu_traits.h" - header "__pstl/cpu_algos/fill.h" - header "__pstl/cpu_algos/find_if.h" - header "__pstl/cpu_algos/for_each.h" - header "__pstl/cpu_algos/merge.h" - header "__pstl/cpu_algos/stable_sort.h" - header "__pstl/cpu_algos/transform.h" - header "__pstl/cpu_algos/transform_reduce.h" - header "__pstl/dispatch.h" - header "__pstl/handle_exception.h" -} + module get_fwd { + header "__fwd/get.h" + export std_core.fwd.pair + export std_core.fwd.tuple + export std.array.fwd + export std.complex.fwd + export std.ranges.subrange_fwd + export std.variant.fwd + } -module std_private_queue_fwd [system] { header "__fwd/queue.h" } + module pstl { + module backend_fwd { + header "__pstl/backend_fwd.h" + } + module backend { + header "__pstl/backend.h" + export * // need to export everything from whatever backend is currently configured + } + module backends { + module default { + header "__pstl/backends/default.h" + export std_core.utility_core.empty + } + module libdispatch { + header "__pstl/backends/libdispatch.h" + export std.pstl.cpu_algos + export std_core.utility_core.empty + } + module serial { + header "__pstl/backends/serial.h" + export std_core.utility_core.empty + } + module std_thread { + header "__pstl/backends/std_thread.h" + export std.pstl.cpu_algos + export std_core.utility_core.empty + } + } + module cpu_algos { + module any_of { + header "__pstl/cpu_algos/any_of.h" + } + module cpu_traits { + header "__pstl/cpu_algos/cpu_traits.h" + } + module 
fill { + header "__pstl/cpu_algos/fill.h" + export std_core.utility_core.empty + } + module find_if { + header "__pstl/cpu_algos/find_if.h" + } + module for_each { + header "__pstl/cpu_algos/for_each.h" + export std_core.utility_core.empty + } + module merge { + header "__pstl/cpu_algos/merge.h" + } + module stable_sort { + header "__pstl/cpu_algos/stable_sort.h" + export std_core.utility_core.empty + } + module transform { + header "__pstl/cpu_algos/transform.h" + } + module transform_reduce { + header "__pstl/cpu_algos/transform_reduce.h" + } + } + module dispatch { header "__pstl/dispatch.h" } + module handle_exception { header "__pstl/handle_exception.h" } + } -module std_private_ostream_basic_ostream [system] { - header "__ostream/basic_ostream.h" - export std_streambuf -} -module std_private_ostream_print [system] { - header "__ostream/print.h" - export std_print -} + // Miscellaneous modules for top-level headers + module bit_reference_fwd { + header "__fwd/bit_reference.h" + } + module bit_reference { + header "__bit_reference" + export std.bit_reference_fwd + } + module hash_table { header "__hash_table" } + module node_handle { header "__node_handle" } + module split_buffer { header "__split_buffer" } + module tree { header "__tree" } + module std_mbstate_t { + header "__std_mbstate_t.h" + export * + } + module verbose_abort { + header "__verbose_abort" + } + module internal_assert { + header "__assert" + export * + } + + module undef_macros { + textual header "__undef_macros" + } + + // This module needs to appear after __tree to work around issues with modules in Objective-C++ mode. 
+ module coroutine { + module coroutine_handle { header "__coroutine/coroutine_handle.h" } + module coroutine_traits { header "__coroutine/coroutine_traits.h" } + module noop_coroutine_handle { header "__coroutine/noop_coroutine_handle.h" } + module trivial_awaitables { header "__coroutine/trivial_awaitables.h" } -module std_private_random_bernoulli_distribution [system] { header "__random/bernoulli_distribution.h" } -module std_private_random_binomial_distribution [system] { header "__random/binomial_distribution.h" } -module std_private_random_cauchy_distribution [system] { header "__random/cauchy_distribution.h" } -module std_private_random_chi_squared_distribution [system] { header "__random/chi_squared_distribution.h" } -module std_private_random_clamp_to_integral [system] { header "__random/clamp_to_integral.h" } -module std_private_random_default_random_engine [system] { header "__random/default_random_engine.h" } -module std_private_random_discard_block_engine [system] { header "__random/discard_block_engine.h" } -module std_private_random_discrete_distribution [system] { - header "__random/discrete_distribution.h" + header "coroutine" + export * + } +} // module std + +// C compatibility headers +// +// These modules need to be their own top-level modules because they depend on the system-provided +// headers (via include_next), which are then free to include other C headers provided by libc++. +// If we group these headers in a single module, we would end up with circular dependencies. 
+module std_complex_h [system] { + header "complex.h" export * } -module std_private_random_exponential_distribution [system] { header "__random/exponential_distribution.h" } -module std_private_random_extreme_value_distribution [system] { header "__random/extreme_value_distribution.h" } -module std_private_random_fisher_f_distribution [system] { header "__random/fisher_f_distribution.h" } -module std_private_random_gamma_distribution [system] { header "__random/gamma_distribution.h" } -module std_private_random_generate_canonical [system] { header "__random/generate_canonical.h" } -module std_private_random_geometric_distribution [system] { header "__random/geometric_distribution.h" } -module std_private_random_independent_bits_engine [system] { header "__random/independent_bits_engine.h" } -module std_private_random_is_seed_sequence [system] { header "__random/is_seed_sequence.h" } -module std_private_random_is_valid [system] { header "__random/is_valid.h" } -module std_private_random_knuth_b [system] { header "__random/knuth_b.h" } -module std_private_random_linear_congruential_engine [system] { header "__random/linear_congruential_engine.h" } -module std_private_random_log2 [system] { header "__random/log2.h" } -module std_private_random_lognormal_distribution [system] { header "__random/lognormal_distribution.h" } -module std_private_random_mersenne_twister_engine [system] { header "__random/mersenne_twister_engine.h" } -module std_private_random_negative_binomial_distribution [system] { header "__random/negative_binomial_distribution.h" } -module std_private_random_normal_distribution [system] { header "__random/normal_distribution.h" } -module std_private_random_piecewise_constant_distribution [system] { - header "__random/piecewise_constant_distribution.h" +module std_ctype_h [system] { + header "ctype.h" export * } -module std_private_random_piecewise_linear_distribution [system] { - header "__random/piecewise_linear_distribution.h" +module std_errno_h 
[system] { + header "errno.h" export * } -module std_private_random_poisson_distribution [system] { header "__random/poisson_distribution.h" } -module std_private_random_random_device [system] { - header "__random/random_device.h" +module std_fenv_h [system] { + header "fenv.h" export * } -module std_private_random_ranlux [system] { header "__random/ranlux.h" } -module std_private_random_seed_seq [system] { - header "__random/seed_seq.h" +module std_float_h [system] { + header "float.h" export * } -module std_private_random_shuffle_order_engine [system] { header "__random/shuffle_order_engine.h" } -module std_private_random_student_t_distribution [system] { header "__random/student_t_distribution.h" } -module std_private_random_subtract_with_carry_engine [system] { header "__random/subtract_with_carry_engine.h" } -module std_private_random_uniform_int_distribution [system] { header "__random/uniform_int_distribution.h" } -module std_private_random_uniform_random_bit_generator [system] { header "__random/uniform_random_bit_generator.h" } -module std_private_random_uniform_real_distribution [system] { header "__random/uniform_real_distribution.h" } -module std_private_random_weibull_distribution [system] { header "__random/weibull_distribution.h" } - -module std_private_ranges_access [system] { header "__ranges/access.h" } -module std_private_ranges_all [system] { - header "__ranges/all.h" - export std_private_functional_compose - export std_private_functional_perfect_forward - export std_private_ranges_owning_view -} -module std_private_ranges_as_rvalue_view [system] { header "__ranges/as_rvalue_view.h" } -module std_private_ranges_chunk_by_view [system] { header "__ranges/chunk_by_view.h" } -module std_private_ranges_common_view [system] { header "__ranges/common_view.h" } -module std_private_ranges_concepts [system] { - header "__ranges/concepts.h" - export std_private_iterator_concepts -} -module std_private_ranges_container_compatible_range [system] { header 
"__ranges/container_compatible_range.h" } -module std_private_ranges_counted [system] { - header "__ranges/counted.h" - export std_span -} -module std_private_ranges_dangling [system] { header "__ranges/dangling.h" } -module std_private_ranges_data [system] { header "__ranges/data.h" } -module std_private_ranges_drop_view [system] { header "__ranges/drop_view.h" } -module std_private_ranges_drop_while_view [system] { header "__ranges/drop_while_view.h" } -module std_private_ranges_elements_view [system] { header "__ranges/elements_view.h" } -module std_private_ranges_empty [system] { header "__ranges/empty.h" } -module std_private_ranges_empty_view [system] { header "__ranges/empty_view.h" } -module std_private_ranges_enable_borrowed_range [system] { header "__ranges/enable_borrowed_range.h" } -module std_private_ranges_enable_view [system] { header "__ranges/enable_view.h" } -module std_private_ranges_filter_view [system] { - header "__ranges/filter_view.h" - export std_private_ranges_range_adaptor -} -module std_private_ranges_from_range [system] { header "__ranges/from_range.h" } -module std_private_ranges_iota_view [system] { header "__ranges/iota_view.h" } -module std_private_ranges_istream_view [system] { - header "__ranges/istream_view.h" -} -module std_private_ranges_join_view [system] { - header "__ranges/join_view.h" - export std_private_iterator_iterator_with_data - export std_private_iterator_segmented_iterator -} -module std_private_ranges_lazy_split_view [system] { - header "__ranges/lazy_split_view.h" - export std_private_ranges_non_propagating_cache +module std_inttypes_h [system] { + header "inttypes.h" + export * } -module std_private_ranges_movable_box [system] { header "__ranges/movable_box.h" } -module std_private_ranges_non_propagating_cache [system] { header "__ranges/non_propagating_cache.h" } -module std_private_ranges_owning_view [system] { header "__ranges/owning_view.h" } -module std_private_ranges_range_adaptor [system] { header 
"__ranges/range_adaptor.h" } -module std_private_ranges_rbegin [system] { header "__ranges/rbegin.h" } -module std_private_ranges_ref_view [system] { header "__ranges/ref_view.h" } -module std_private_ranges_rend [system] { header "__ranges/rend.h" } -module std_private_ranges_repeat_view [system] { header "__ranges/repeat_view.h" } -module std_private_ranges_reverse_view [system] { header "__ranges/reverse_view.h" } -module std_private_ranges_single_view [system] { header "__ranges/single_view.h" } -module std_private_ranges_size [system] { - header "__ranges/size.h" - export std_private_type_traits_make_unsigned +module std_locale_h [system] { + header "locale.h" + export * } -module std_private_ranges_split_view [system] { header "__ranges/split_view.h" } -module std_private_ranges_subrange [system] { - header "__ranges/subrange.h" - export std_private_ranges_subrange_fwd +module std_math_h [system] { + header "math.h" + export * } -module std_private_ranges_subrange_fwd [system] { - header "__fwd/subrange.h" - export std_private_iterator_concepts +module std_stdatomic_h [system] { + header "stdatomic.h" + export * } -module std_private_ranges_take_view [system] { header "__ranges/take_view.h" } -module std_private_ranges_take_while_view [system] { header "__ranges/take_while_view.h" } -module std_private_ranges_to [system] { header "__ranges/to.h" } -module std_private_ranges_transform_view [system] { - header "__ranges/transform_view.h" - export std_private_functional_bind_back - export std_private_functional_perfect_forward - export std_private_ranges_movable_box +module std_stdbool_h [system] { + // <stdbool.h>'s __bool_true_false_are_defined macro requires textual inclusion.
+ textual header "stdbool.h" } -module std_private_ranges_view_interface [system] { header "__ranges/view_interface.h" } -module std_private_ranges_views [system] { header "__ranges/views.h" } -module std_private_ranges_zip_view [system] { - header "__ranges/zip_view.h" - export std_private_utility_pair +module std_stddef_h [system] { + // <stddef.h>'s __need_* macros require textual inclusion. + textual header "stddef.h" } - -module std_private_span_span_fwd [system] { header "__fwd/span.h" } - -module std_private_stack_fwd [system] { header "__fwd/stack.h" } - -module std_private_string_char_traits [system] { - header "__string/char_traits.h" +module std_stdint_h [system] { + header "stdint.h" export * } -module std_private_string_constexpr_c_functions [system] { - header "__string/constexpr_c_functions.h" - export std_private_type_traits_is_equality_comparable -} -module std_private_string_extern_template_lists [system] { header "__string/extern_template_lists.h" } -module std_private_string_string_fwd [system] { header "__fwd/string.h" } - -module std_private_string_view_string_view_fwd [system] { header "__fwd/string_view.h" } - -module std_private_system_error_errc [system] { header "__system_error/errc.h" } -module std_private_system_error_error_category [system] { header "__system_error/error_category.h" } -module std_private_system_error_error_code [system] { - header "__system_error/error_code.h" - export std_private_functional_hash - export std_private_functional_unary_function +module std_stdio_h [system] { + // <stdio.h>'s __need_* macros require textual inclusion. + textual header "stdio.h" } -module std_private_system_error_error_condition [system] { - header "__system_error/error_condition.h" - export std_private_functional_hash - export std_private_functional_unary_function +module std_stdlib_h [system] { + // <stdlib.h>'s __need_* macros require textual inclusion.
+ textual header "stdlib.h" } -module std_private_system_error_system_error [system] { header "__system_error/system_error.h" } - -module std_private_thread_formatter [system] { header "__thread/formatter.h" } -module std_private_thread_id [system] { header "__thread/id.h" } -module std_private_thread_jthread [system] { - header "__thread/jthread.h" +module std_string_h [system] { + header "string.h" export * } -module std_private_thread_poll_with_backoff [system] { header "__thread/poll_with_backoff.h" } -module std_private_thread_support [system] { - header "__thread/support.h" +module std_tgmath_h [system] { + header "tgmath.h" export * } -module std_private_thread_support_c11 [system] { textual header "__thread/support/c11.h" } -module std_private_thread_support_external [system] { textual header "__thread/support/external.h" } -module std_private_thread_support_pthread [system] { textual header "__thread/support/pthread.h" } -module std_private_thread_support_windows [system] { textual header "__thread/support/windows.h" } -module std_private_thread_this_thread [system] { header "__thread/this_thread.h" } -module std_private_thread_thread [system] { - header "__thread/thread.h" +module std_uchar_h [system] { + header "uchar.h" export * } -module std_private_thread_timed_backoff_policy [system] { header "__thread/timed_backoff_policy.h" } - -module std_private_tuple_find_index [system] { header "__tuple/find_index.h" } -module std_private_tuple_ignore [system] { header "__tuple/ignore.h" } -module std_private_tuple_make_tuple_types [system] { header "__tuple/make_tuple_types.h" } -module std_private_tuple_tuple_like_no_subrange [system] { - header "__tuple/tuple_like_no_subrange.h" -} -module std_private_tuple_sfinae_helpers [system] { header "__tuple/sfinae_helpers.h" } -module std_private_tuple_tuple_element [system] { header "__tuple/tuple_element.h" } -module std_private_tuple_tuple_fwd [system] { header "__fwd/tuple.h" } -module std_private_get_fwd 
[system] { - header "__fwd/get.h" - export std_private_array_array_fwd - export std_private_complex_complex_fwd - export std_private_ranges_subrange_fwd - export std_private_tuple_tuple_fwd - export std_private_utility_pair_fwd - export std_private_variant_fwd +module std_wchar_h [system] { + // <wchar.h>'s __need_* macros require textual inclusion. + textual header "wchar.h" } -module std_private_tuple_tuple_indices [system] { header "__tuple/tuple_indices.h" } -module std_private_tuple_tuple_like [system] { - header "__tuple/tuple_like.h" +module std_wctype_h [system] { + header "wctype.h" export * } -module std_private_tuple_tuple_like_ext [system] { header "__tuple/tuple_like_ext.h" } -module std_private_tuple_tuple_size [system] { - header "__tuple/tuple_size.h" - export std_private_type_traits_integral_constant -} -module std_private_tuple_tuple_types [system] { header "__tuple/tuple_types.h" } -module std_private_type_traits_add_const [system] { header "__type_traits/add_const.h" } -module std_private_type_traits_add_cv [system] { header "__type_traits/add_cv.h" } -module std_private_type_traits_add_lvalue_reference [system] { - header "__type_traits/add_lvalue_reference.h" - export std_private_type_traits_is_referenceable -} -module std_private_type_traits_add_pointer [system] { header "__type_traits/add_pointer.h" } -module std_private_type_traits_add_rvalue_reference [system] { header "__type_traits/add_rvalue_reference.h" } -module std_private_type_traits_add_volatile [system] { header "__type_traits/add_volatile.h" } -module std_private_type_traits_aligned_storage [system] { header "__type_traits/aligned_storage.h" } -module std_private_type_traits_aligned_union [system] { header "__type_traits/aligned_union.h" } -module std_private_type_traits_alignment_of [system] { header "__type_traits/alignment_of.h" } -module std_private_type_traits_can_extract_key [system] { header "__type_traits/can_extract_key.h" } -module std_private_type_traits_common_reference
[system] { - header "__type_traits/common_reference.h" - export std_private_type_traits_remove_cvref -} -module std_private_type_traits_common_type [system] { - header "__type_traits/common_type.h" - export std_private_type_traits_type_identity - export std_private_utility_declval - export std_private_utility_empty -} -module std_private_type_traits_conditional [system] { header "__type_traits/conditional.h" } -module std_private_type_traits_conjunction [system] { header "__type_traits/conjunction.h" } -module std_private_type_traits_copy_cv [system] { header "__type_traits/copy_cv.h" } -module std_private_type_traits_copy_cvref [system] { header "__type_traits/copy_cvref.h" } -module std_private_type_traits_datasizeof [system] { header "__type_traits/datasizeof.h" } -module std_private_type_traits_decay [system] { - header "__type_traits/decay.h" - export std_private_type_traits_add_pointer -} -module std_private_type_traits_dependent_type [system] { header "__type_traits/dependent_type.h" } -module std_private_type_traits_desugars_to [system] { header "__type_traits/desugars_to.h" } -module std_private_type_traits_disjunction [system] { header "__type_traits/disjunction.h" } -module std_private_type_traits_enable_if [system] { header "__type_traits/enable_if.h" } -module std_private_type_traits_extent [system] { header "__type_traits/extent.h" } -module std_private_type_traits_has_unique_object_representation [system] { header "__type_traits/has_unique_object_representation.h" } -module std_private_type_traits_has_virtual_destructor [system] { header "__type_traits/has_virtual_destructor.h" } -module std_private_type_traits_integral_constant [system] { header "__type_traits/integral_constant.h" } -module std_private_type_traits_invoke [system] { - header "__type_traits/invoke.h" - export std_private_type_traits_conditional - export std_private_type_traits_decay - export std_private_type_traits_decay - export std_private_type_traits_enable_if - export 
std_private_type_traits_is_base_of - export std_private_type_traits_is_core_convertible - export std_private_type_traits_is_reference_wrapper - export std_private_type_traits_is_same - export std_private_type_traits_is_void - export std_private_type_traits_nat - export std_private_type_traits_remove_cv -} -module std_private_type_traits_is_abstract [system] { header "__type_traits/is_abstract.h" } -module std_private_type_traits_is_aggregate [system] { header "__type_traits/is_aggregate.h" } -module std_private_type_traits_is_allocator [system] { header "__type_traits/is_allocator.h" } -module std_private_type_traits_is_always_bitcastable [system] { header "__type_traits/is_always_bitcastable.h" } -module std_private_type_traits_is_arithmetic [system] { - header "__type_traits/is_arithmetic.h" - export std_private_type_traits_integral_constant -} -module std_private_type_traits_is_array [system] { - header "__type_traits/is_array.h" - export std_private_type_traits_integral_constant -} -module std_private_type_traits_is_assignable [system] { header "__type_traits/is_assignable.h" } -module std_private_type_traits_is_base_of [system] { header "__type_traits/is_base_of.h" } -module std_private_type_traits_is_bounded_array [system] { header "__type_traits/is_bounded_array.h" } -module std_private_type_traits_is_callable [system] { - header "__type_traits/is_callable.h" - export std_private_type_traits_integral_constant -} -module std_private_type_traits_is_char_like_type [system] { header "__type_traits/is_char_like_type.h" } -module std_private_type_traits_is_class [system] { header "__type_traits/is_class.h" } -module std_private_type_traits_is_compound [system] { header "__type_traits/is_compound.h" } -module std_private_type_traits_is_const [system] { header "__type_traits/is_const.h" } -module std_private_type_traits_is_constant_evaluated [system] { header "__type_traits/is_constant_evaluated.h" } -module std_private_type_traits_is_constructible [system] { header 
"__type_traits/is_constructible.h" } -module std_private_type_traits_is_convertible [system] { - header "__type_traits/is_convertible.h" - export std_private_type_traits_is_array -} -module std_private_type_traits_is_copy_assignable [system] { header "__type_traits/is_copy_assignable.h" } -module std_private_type_traits_is_copy_constructible [system] { header "__type_traits/is_copy_constructible.h" } -module std_private_type_traits_is_core_convertible [system] { - header "__type_traits/is_core_convertible.h" - export std_private_type_traits_integral_constant -} -module std_private_type_traits_is_destructible [system] { header "__type_traits/is_destructible.h" } -module std_private_type_traits_is_empty [system] { header "__type_traits/is_empty.h" } -module std_private_type_traits_is_enum [system] { - header "__type_traits/is_enum.h" - export std_private_type_traits_integral_constant -} -module std_private_type_traits_is_equality_comparable [system] { - header "__type_traits/is_equality_comparable.h" - export std_private_type_traits_integral_constant -} -module std_private_type_traits_is_execution_policy [system] { - header "__type_traits/is_execution_policy.h" - export std_private_type_traits_remove_cvref -} -module std_private_type_traits_is_final [system] { header "__type_traits/is_final.h" } -module std_private_type_traits_is_floating_point [system] { header "__type_traits/is_floating_point.h" } -module std_private_type_traits_is_function [system] { header "__type_traits/is_function.h" } -module std_private_type_traits_is_fundamental [system] { header "__type_traits/is_fundamental.h" } -module std_private_type_traits_is_implicitly_default_constructible [system] { - header "__type_traits/is_implicitly_default_constructible.h" - export std_private_type_traits_integral_constant -} -module std_private_type_traits_is_integral [system] { - header "__type_traits/is_integral.h" - export std_private_type_traits_integral_constant -} -module 
std_private_type_traits_is_literal_type [system] { header "__type_traits/is_literal_type.h" } -module std_private_type_traits_is_member_pointer [system] { header "__type_traits/is_member_pointer.h" } -module std_private_type_traits_is_nothrow_assignable [system] { header "__type_traits/is_nothrow_assignable.h" } -module std_private_type_traits_is_nothrow_constructible [system] { - header "__type_traits/is_nothrow_constructible.h" - export std_private_type_traits_integral_constant -} -module std_private_type_traits_is_nothrow_convertible [system] { header "__type_traits/is_nothrow_convertible.h" } -module std_private_type_traits_is_nothrow_destructible [system] { - header "__type_traits/is_nothrow_destructible.h" - export std_private_type_traits_is_destructible -} -module std_private_type_traits_is_null_pointer [system] { - header "__type_traits/is_null_pointer.h" - export std_cstddef -} -module std_private_type_traits_is_object [system] { - header "__type_traits/is_object.h" - export std_private_type_traits_is_scalar -} -module std_private_type_traits_is_pod [system] { header "__type_traits/is_pod.h" } -module std_private_type_traits_is_pointer [system] { header "__type_traits/is_pointer.h" } -module std_private_type_traits_is_polymorphic [system] { header "__type_traits/is_polymorphic.h" } -module std_private_type_traits_is_primary_template [system] { - header "__type_traits/is_primary_template.h" - export std_private_type_traits_enable_if -} -module std_private_type_traits_is_reference [system] { - header "__type_traits/is_reference.h" - export std_private_type_traits_integral_constant -} -module std_private_type_traits_is_reference_wrapper [system] { header "__type_traits/is_reference_wrapper.h" } -module std_private_type_traits_is_referenceable [system] { header "__type_traits/is_referenceable.h" } -module std_private_type_traits_is_same [system] { - header "__type_traits/is_same.h" - export std_private_type_traits_integral_constant -} -module 
std_private_type_traits_is_scalar [system] { - header "__type_traits/is_scalar.h" - export std_private_type_traits_is_null_pointer -} -module std_private_type_traits_is_signed [system] { header "__type_traits/is_signed.h" } -module std_private_type_traits_is_signed_integer [system] { header "__type_traits/is_signed_integer.h" } -module std_private_type_traits_is_specialization [system] { header "__type_traits/is_specialization.h" } -module std_private_type_traits_is_standard_layout [system] { header "__type_traits/is_standard_layout.h" } -module std_private_type_traits_is_swappable [system] { - header "__type_traits/is_swappable.h" - export std_private_type_traits_is_move_constructible -} -module std_private_type_traits_is_trivial [system] { header "__type_traits/is_trivial.h" } -module std_private_type_traits_is_trivially_assignable [system] { header "__type_traits/is_trivially_assignable.h" } -module std_private_type_traits_is_trivially_constructible [system] { header "__type_traits/is_trivially_constructible.h" } -module std_private_type_traits_is_trivially_copyable [system] { - header "__type_traits/is_trivially_copyable.h" - export std_private_type_traits_integral_constant -} -module std_private_type_traits_is_trivially_destructible [system] { header "__type_traits/is_trivially_destructible.h" } -module std_private_type_traits_is_trivially_lexicographically_comparable [system] { header "__type_traits/is_trivially_lexicographically_comparable.h" } -module std_private_type_traits_is_trivially_relocatable [system] { header "__type_traits/is_trivially_relocatable.h" } -module std_private_type_traits_is_unbounded_array [system] { header "__type_traits/is_unbounded_array.h" } -module std_private_type_traits_is_union [system] { header "__type_traits/is_union.h" } -module std_private_type_traits_is_unsigned [system] { header "__type_traits/is_unsigned.h" } -module std_private_type_traits_is_unsigned_integer [system] { header "__type_traits/is_unsigned_integer.h" } 
-module std_private_type_traits_is_valid_expansion [system] { header "__type_traits/is_valid_expansion.h" } -module std_private_type_traits_is_void [system] { - header "__type_traits/is_void.h" - export std_private_type_traits_integral_constant -} -module std_private_type_traits_is_volatile [system] { header "__type_traits/is_volatile.h" } -module std_private_type_traits_lazy [system] { header "__type_traits/lazy.h" } -module std_private_type_traits_make_32_64_or_128_bit [system] { header "__type_traits/make_32_64_or_128_bit.h" } -module std_private_type_traits_make_const_lvalue_ref [system] { header "__type_traits/make_const_lvalue_ref.h" } -module std_private_type_traits_make_signed [system] { header "__type_traits/make_signed.h" } -module std_private_type_traits_make_unsigned [system] { - header "__type_traits/make_unsigned.h" - export std_private_type_traits_is_unsigned -} -module std_private_type_traits_maybe_const [system] { header "__type_traits/maybe_const.h" } -module std_private_type_traits_nat [system] { header "__type_traits/nat.h" } -module std_private_type_traits_negation [system] { header "__type_traits/negation.h" } -module std_private_type_traits_promote [system] { header "__type_traits/promote.h" } -module std_private_type_traits_rank [system] { header "__type_traits/rank.h" } -module std_private_type_traits_remove_all_extents [system] { header "__type_traits/remove_all_extents.h" } -module std_private_type_traits_remove_const [system] { header "__type_traits/remove_const.h" } -module std_private_type_traits_remove_const_ref [system] { header "__type_traits/remove_const_ref.h" } -module std_private_type_traits_remove_cv [system] { - header "__type_traits/remove_cv.h" - export std_private_type_traits_remove_const - export std_private_type_traits_remove_volatile -} -module std_private_type_traits_remove_cvref [system] { header "__type_traits/remove_cvref.h" } -module std_private_type_traits_remove_extent [system] { header 
"__type_traits/remove_extent.h" } -module std_private_type_traits_remove_pointer [system] { header "__type_traits/remove_pointer.h" } -module std_private_type_traits_remove_reference [system] { header "__type_traits/remove_reference.h" } -module std_private_type_traits_remove_volatile [system] { header "__type_traits/remove_volatile.h" } -module std_private_type_traits_result_of [system] { header "__type_traits/result_of.h" } -module std_private_type_traits_strip_signature [system] { header "__type_traits/strip_signature.h" } -module std_private_type_traits_type_identity [system] { header "__type_traits/type_identity.h" } -module std_private_type_traits_type_list [system] { header "__type_traits/type_list.h" } -module std_private_type_traits_underlying_type [system] { - header "__type_traits/underlying_type.h" - export std_private_type_traits_is_enum -} -module std_private_type_traits_unwrap_ref [system] { header "__type_traits/unwrap_ref.h" } -module std_private_type_traits_void_t [system] { header "__type_traits/void_t.h" } - -module std_private_utility_as_const [system] { header "__utility/as_const.h" } -module std_private_utility_as_lvalue [system] { header "__utility/as_lvalue.h" } -module std_private_utility_auto_cast [system] { - header "__utility/auto_cast.h" - export std_private_type_traits_decay -} -module std_private_utility_cmp [system] { - header "__utility/cmp.h" - export std_private_type_traits_make_unsigned -} -module std_private_utility_convert_to_integral [system] { header "__utility/convert_to_integral.h" } -module std_private_utility_declval [system] { header "__utility/declval.h" } -module std_private_utility_empty [system] { header "__utility/empty.h" } -module std_private_utility_exception_guard [system] { header "__utility/exception_guard.h" } -module std_private_utility_exchange [system] { header "__utility/exchange.h" } -module std_private_utility_forward [system] { header "__utility/forward.h" } -module std_private_utility_forward_like 
[system] { header "__utility/forward_like.h" } -module std_private_utility_in_place [system] { - header "__utility/in_place.h" - export std_private_type_traits_integral_constant -} -module std_private_utility_integer_sequence [system] { header "__utility/integer_sequence.h" } -module std_private_utility_is_pointer_in_range [system] { header "__utility/is_pointer_in_range.h" } -module std_private_utility_is_valid_range [system] { header "__utility/is_valid_range.h" } -module std_private_utility_move [system] { - header "__utility/move.h" - export std_private_type_traits_is_copy_constructible - export std_private_type_traits_is_nothrow_move_constructible - export std_private_type_traits_remove_reference -} -module std_private_utility_no_destroy [system] { header "__utility/no_destroy.h" } -module std_private_utility_pair [system] { - header "__utility/pair.h" - export std_private_ranges_subrange_fwd - export std_private_tuple_pair_like - export std_private_type_traits_is_assignable - export std_private_type_traits_is_constructible - export std_private_type_traits_is_convertible - export std_private_type_traits_is_copy_assignable - export std_private_type_traits_is_move_assignable - export std_private_type_traits_is_nothrow_copy_constructible - export std_private_type_traits_is_nothrow_default_constructible - export std_private_type_traits_is_nothrow_move_assignable - export std_private_utility_pair_fwd -} -module std_private_utility_pair_fwd [system] { header "__fwd/pair.h" } -module std_private_utility_piecewise_construct [system] { header "__utility/piecewise_construct.h" } -module std_private_utility_priority_tag [system] { header "__utility/priority_tag.h" } -module std_private_utility_private_constructor_tag [system] { header "__utility/private_constructor_tag.h" } -module std_private_utility_rel_ops [system] { header "__utility/rel_ops.h" } -module std_private_utility_small_buffer [system] { header "__utility/small_buffer.h" } -module std_private_utility_swap 
[system] { - header "__utility/swap.h" - export std_private_type_traits_is_swappable +// This header is used by other C compatibility headers so it needs to be in its own module. +module std_private_mbstate_t [system] { + header "__mbstate_t.h" + export * } -module std_private_utility_to_underlying [system] { header "__utility/to_underlying.h" } -module std_private_utility_unreachable [system] { header "__utility/unreachable.h" } - -module std_private_variant_monostate [system] { header "__variant/monostate.h" } -module std_private_variant_fwd [system] { header "__fwd/variant.h" } - -module std_private_vector_fwd [system] { header "__fwd/vector.h" } diff --git a/libcxx/test/libcxx/clang_modules_include.gen.py b/libcxx/test/libcxx/clang_modules_include.gen.py index f0421b2e73813..bc028f2a0809a 100644 --- a/libcxx/test/libcxx/clang_modules_include.gen.py +++ b/libcxx/test/libcxx/clang_modules_include.gen.py @@ -37,13 +37,17 @@ // TODO: Investigate this failure // UNSUPPORTED: LIBCXX-FREEBSD-FIXME +// TODO: Investigate why this doesn't work on Picolibc once the locale base API is refactored +// UNSUPPORTED: LIBCXX-PICOLIBC-FIXME + {lit_header_restrictions.get(header, '')} #include <{header}> """) -print(f"""\ -//--- __std_clang_module.compile.pass.mm +print( + f"""\ +//--- import_std.compile.pass.mm // RUN: %{{cxx}} %s %{{flags}} %{{compile_flags}} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only // REQUIRES: clang-modules-build @@ -61,6 +65,10 @@ // TODO: Investigate this failure // UNSUPPORTED: LIBCXX-FREEBSD-FIXME +// TODO: Investigate why this doesn't work on Picolibc once the locale base API is refactored +// UNSUPPORTED: LIBCXX-PICOLIBC-FIXME + @import std; -""") +""" +) diff --git a/libcxx/test/std/experimental/utilities/utility/utility.synop/includes.pass.cpp b/libcxx/test/std/experimental/utilities/utility/utility.synop/includes.pass.cpp deleted file mode 100644 index 7e27adfab1971..0000000000000 --- 
a/libcxx/test/std/experimental/utilities/utility/utility.synop/includes.pass.cpp +++ /dev/null @@ -1,23 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// - -#include - -#include "test_macros.h" - -#ifndef _LIBCPP_UTILITY -# error " must include " -#endif - -int main(int, char**) -{ - - return 0; -} diff --git a/libcxx/utils/CMakeLists.txt b/libcxx/utils/CMakeLists.txt index 1116531fa0653..027e485fc15ef 100644 --- a/libcxx/utils/CMakeLists.txt +++ b/libcxx/utils/CMakeLists.txt @@ -2,10 +2,6 @@ add_custom_target(libcxx-generate-feature-test-macros COMMAND "${Python3_EXECUTABLE}" "${LIBCXX_SOURCE_DIR}/utils/generate_feature_test_macro_components.py" COMMENT "Generate the header and tests for feature test macros.") -add_custom_target(libcxx-generate-std-clang-module-header - COMMAND "${Python3_EXECUTABLE}" "${CMAKE_CURRENT_SOURCE_DIR}/generate_std_clang_module_header.py" - COMMENT "Generate the <__std_clang_module> header") - add_custom_target(libcxx-generate-std-cppm-in-file COMMAND "${Python3_EXECUTABLE}" @@ -57,7 +53,6 @@ add_custom_target(libcxx-indic-conjunct-break-table add_custom_target(libcxx-generate-files DEPENDS libcxx-generate-feature-test-macros - libcxx-generate-std-clang-module-header libcxx-generate-std-cppm-in-file libcxx-generate-std-compat-cppm-in-file libcxx-generate-extended-grapheme-cluster-tables diff --git a/libcxx/utils/generate_std_clang_module_header.py b/libcxx/utils/generate_std_clang_module_header.py deleted file mode 100644 index 33c9acf395379..0000000000000 --- a/libcxx/utils/generate_std_clang_module_header.py +++ /dev/null @@ -1,63 +0,0 @@ -# 
===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# ===----------------------------------------------------------------------===## - -import os.path - -import libcxx.header_information - -header_restrictions = libcxx.header_information.header_restrictions - -libcxx_include_directory = os.path.join( - os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "include" -) -with open( - os.path.join(libcxx_include_directory, "__std_clang_module"), "w" -) as std_clang_module_header: - std_clang_module_header.write( - """\ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// WARNING, this entire header is generated by -// utils/generate_std_clang_module_header.py -// DO NOT MODIFY! - -// This header should not be directly included, it's exclusively to import all -// of the libc++ public clang modules for the `std` clang module to export. In -// other words, it's to facilitate `@import std;` in Objective-C++ and `import std` -// in Swift to expose all of the libc++ interfaces. This is generally not -// recommended, however there are some clients that need to import all of libc++ -// without knowing what "all" is. 
-#if !__building_module(std) -# error "Do not include this header directly, include individual headers instead" -#endif - -#include <__config> - -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif - -""" - ) - # Include the angle brackets in sorting so that sorts before - # like check-format wants. - for include, header in sorted([(f"<{header}>", header) for header in libcxx.header_information.public_headers]): - header_restriction = header_restrictions.get(header) - if header_restriction: - std_clang_module_header.write(f"#if {header_restriction}\n") - std_clang_module_header.write(f"# include {include}\n") - std_clang_module_header.write(f"#endif\n") - else: - std_clang_module_header.write(f"#include {include}\n") From 8ab50da589fd2692052dcb85edf06d1d2d8da42c Mon Sep 17 00:00:00 2001 From: Alexey Samsonov Date: Mon, 30 Sep 2024 11:20:32 -0700 Subject: [PATCH 008/151] Include in Toy Lexer examples. (#110449) These files all use `strtod` - make sure to include a proper header for this function. Otherwise, building MLIR fails on some systems after the recent commit 1b5f6916199ce09244cdb52c6911f2028e6ca95a which removed inclusion of `` and thus broke transitive inclusion of `` in these headers. 
--- mlir/examples/toy/Ch1/include/toy/Lexer.h | 1 + mlir/examples/toy/Ch2/include/toy/Lexer.h | 1 + mlir/examples/toy/Ch3/include/toy/Lexer.h | 1 + mlir/examples/toy/Ch4/include/toy/Lexer.h | 1 + mlir/examples/toy/Ch5/include/toy/Lexer.h | 1 + mlir/examples/toy/Ch6/include/toy/Lexer.h | 1 + mlir/examples/toy/Ch7/include/toy/Lexer.h | 1 + 7 files changed, 7 insertions(+) diff --git a/mlir/examples/toy/Ch1/include/toy/Lexer.h b/mlir/examples/toy/Ch1/include/toy/Lexer.h index ecbb3b4e0e583..d420a7ebbf3b6 100644 --- a/mlir/examples/toy/Ch1/include/toy/Lexer.h +++ b/mlir/examples/toy/Ch1/include/toy/Lexer.h @@ -15,6 +15,7 @@ #include "llvm/ADT/StringRef.h" +#include #include #include diff --git a/mlir/examples/toy/Ch2/include/toy/Lexer.h b/mlir/examples/toy/Ch2/include/toy/Lexer.h index 3c59cd9805fef..22822cc105957 100644 --- a/mlir/examples/toy/Ch2/include/toy/Lexer.h +++ b/mlir/examples/toy/Ch2/include/toy/Lexer.h @@ -15,6 +15,7 @@ #include "llvm/ADT/StringRef.h" +#include #include #include diff --git a/mlir/examples/toy/Ch3/include/toy/Lexer.h b/mlir/examples/toy/Ch3/include/toy/Lexer.h index 3c59cd9805fef..22822cc105957 100644 --- a/mlir/examples/toy/Ch3/include/toy/Lexer.h +++ b/mlir/examples/toy/Ch3/include/toy/Lexer.h @@ -15,6 +15,7 @@ #include "llvm/ADT/StringRef.h" +#include #include #include diff --git a/mlir/examples/toy/Ch4/include/toy/Lexer.h b/mlir/examples/toy/Ch4/include/toy/Lexer.h index 3c59cd9805fef..22822cc105957 100644 --- a/mlir/examples/toy/Ch4/include/toy/Lexer.h +++ b/mlir/examples/toy/Ch4/include/toy/Lexer.h @@ -15,6 +15,7 @@ #include "llvm/ADT/StringRef.h" +#include #include #include diff --git a/mlir/examples/toy/Ch5/include/toy/Lexer.h b/mlir/examples/toy/Ch5/include/toy/Lexer.h index 3c59cd9805fef..22822cc105957 100644 --- a/mlir/examples/toy/Ch5/include/toy/Lexer.h +++ b/mlir/examples/toy/Ch5/include/toy/Lexer.h @@ -15,6 +15,7 @@ #include "llvm/ADT/StringRef.h" +#include #include #include diff --git 
a/mlir/examples/toy/Ch6/include/toy/Lexer.h b/mlir/examples/toy/Ch6/include/toy/Lexer.h index 3c59cd9805fef..22822cc105957 100644 --- a/mlir/examples/toy/Ch6/include/toy/Lexer.h +++ b/mlir/examples/toy/Ch6/include/toy/Lexer.h @@ -15,6 +15,7 @@ #include "llvm/ADT/StringRef.h" +#include #include #include diff --git a/mlir/examples/toy/Ch7/include/toy/Lexer.h b/mlir/examples/toy/Ch7/include/toy/Lexer.h index a3fde91a357a3..f022c2f48ac8e 100644 --- a/mlir/examples/toy/Ch7/include/toy/Lexer.h +++ b/mlir/examples/toy/Ch7/include/toy/Lexer.h @@ -15,6 +15,7 @@ #include "llvm/ADT/StringRef.h" +#include #include #include From 9e85937b835e82846ab8db53586f0844e6783804 Mon Sep 17 00:00:00 2001 From: vporpo Date: Mon, 30 Sep 2024 11:24:55 -0700 Subject: [PATCH 009/151] [SandboxIR][NFC] Rename SandboxIRValues.def to Values.def (#110538) --- llvm/include/llvm/SandboxIR/Constant.h | 2 +- llvm/include/llvm/SandboxIR/Context.h | 2 +- llvm/include/llvm/SandboxIR/Instruction.h | 4 ++-- llvm/include/llvm/SandboxIR/Type.h | 4 ++-- llvm/include/llvm/SandboxIR/User.h | 2 +- llvm/include/llvm/SandboxIR/Value.h | 6 +++--- .../llvm/SandboxIR/{SandboxIRValues.def => Values.def} | 2 +- llvm/lib/SandboxIR/Instruction.cpp | 4 ++-- llvm/lib/SandboxIR/User.cpp | 2 +- llvm/unittests/SandboxIR/SandboxIRTest.cpp | 2 +- 10 files changed, 15 insertions(+), 15 deletions(-) rename llvm/include/llvm/SandboxIR/{SandboxIRValues.def => Values.def} (98%) diff --git a/llvm/include/llvm/SandboxIR/Constant.h b/llvm/include/llvm/SandboxIR/Constant.h index 7965f947e31b8..6eafdefde6026 100644 --- a/llvm/include/llvm/SandboxIR/Constant.h +++ b/llvm/include/llvm/SandboxIR/Constant.h @@ -46,7 +46,7 @@ class Constant : public sandboxir::User { static bool classof(const sandboxir::Value *From) { switch (From->getSubclassID()) { #define DEF_CONST(ID, CLASS) case ClassID::ID: -#include "llvm/SandboxIR/SandboxIRValues.def" +#include "llvm/SandboxIR/Values.def" return true; default: return false; diff --git 
a/llvm/include/llvm/SandboxIR/Context.h b/llvm/include/llvm/SandboxIR/Context.h index acfffd9ccd4a7..77924fbcd5ace 100644 --- a/llvm/include/llvm/SandboxIR/Context.h +++ b/llvm/include/llvm/SandboxIR/Context.h @@ -71,7 +71,7 @@ class Context { // Friends for getOrCreateConstant(). #define DEF_CONST(ID, CLASS) friend class CLASS; -#include "llvm/SandboxIR/SandboxIRValues.def" +#include "llvm/SandboxIR/Values.def" /// Create a sandboxir::BasicBlock for an existing LLVM IR \p BB. This will /// also create all contents of the block. diff --git a/llvm/include/llvm/SandboxIR/Instruction.h b/llvm/include/llvm/SandboxIR/Instruction.h index f5f5bb5c4443c..a34573a0bc1b0 100644 --- a/llvm/include/llvm/SandboxIR/Instruction.h +++ b/llvm/include/llvm/SandboxIR/Instruction.h @@ -26,7 +26,7 @@ class Instruction : public User { #define OP(OPC) OPC, #define OPCODES(...) __VA_ARGS__ #define DEF_INSTR(ID, OPC, CLASS) OPC -#include "llvm/SandboxIR/SandboxIRValues.def" +#include "llvm/SandboxIR/Values.def" }; protected: @@ -365,7 +365,7 @@ template class SingleLLVMInstructionImpl : public Instruction { // All instructions are friends with this so they can call the constructor. #define DEF_INSTR(ID, OPC, CLASS) friend class CLASS; -#include "llvm/SandboxIR/SandboxIRValues.def" +#include "llvm/SandboxIR/Values.def" friend class UnaryInstruction; friend class CallBase; friend class FuncletPadInst; diff --git a/llvm/include/llvm/SandboxIR/Type.h b/llvm/include/llvm/SandboxIR/Type.h index 94ea4652c72c8..8094f66567fb8 100644 --- a/llvm/include/llvm/SandboxIR/Type.h +++ b/llvm/include/llvm/SandboxIR/Type.h @@ -35,7 +35,7 @@ class TargetExtType; class Module; #define DEF_INSTR(ID, OPCODE, CLASS) class CLASS; #define DEF_CONST(ID, CLASS) class CLASS; -#include "llvm/SandboxIR/SandboxIRValues.def" +#include "llvm/SandboxIR/Values.def" /// Just like llvm::Type these are immutable, unique, never get freed and can /// only be created via static factory methods. 
@@ -65,7 +65,7 @@ class Type { // Friend all instruction classes because `create()` functions use LLVMTy. #define DEF_INSTR(ID, OPCODE, CLASS) friend class CLASS; #define DEF_CONST(ID, CLASS) friend class CLASS; -#include "llvm/SandboxIR/SandboxIRValues.def" +#include "llvm/SandboxIR/Values.def" Context &Ctx; Type(llvm::Type *LLVMTy, Context &Ctx) : LLVMTy(LLVMTy), Ctx(Ctx) {} diff --git a/llvm/include/llvm/SandboxIR/User.h b/llvm/include/llvm/SandboxIR/User.h index 5e47ba5e727f4..80e672de34905 100644 --- a/llvm/include/llvm/SandboxIR/User.h +++ b/llvm/include/llvm/SandboxIR/User.h @@ -26,7 +26,7 @@ class OperandUseIterator { OperandUseIterator(const class Use &Use) : Use(Use) {} friend class User; // For constructor #define DEF_INSTR(ID, OPC, CLASS) friend class CLASS; // For constructor -#include "llvm/SandboxIR/SandboxIRValues.def" +#include "llvm/SandboxIR/Values.def" public: using difference_type = std::ptrdiff_t; diff --git a/llvm/include/llvm/SandboxIR/Value.h b/llvm/include/llvm/SandboxIR/Value.h index e7d516f38156c..b28f0e664f80b 100644 --- a/llvm/include/llvm/SandboxIR/Value.h +++ b/llvm/include/llvm/SandboxIR/Value.h @@ -18,7 +18,7 @@ namespace llvm::sandboxir { #define DEF_INSTR(ID, OPC, CLASS) class CLASS; #define DEF_CONST(ID, CLASS) class CLASS; #define DEF_USER(ID, CLASS) class CLASS; -#include "llvm/SandboxIR/SandboxIRValues.def" +#include "llvm/SandboxIR/Values.def" class Context; class FuncletPadInst; class Type; @@ -63,7 +63,7 @@ class Value { #define DEF_USER(ID, CLASS) ID, #define DEF_CONST(ID, CLASS) ID, #define DEF_INSTR(ID, OPC, CLASS) ID, -#include "llvm/SandboxIR/SandboxIRValues.def" +#include "llvm/SandboxIR/Values.def" }; protected: @@ -81,7 +81,7 @@ class Value { #define DEF_INSTR(ID, OPC, CLASS) \ case ClassID::ID: \ return #ID; -#include "llvm/SandboxIR/SandboxIRValues.def" +#include "llvm/SandboxIR/Values.def" } llvm_unreachable("Unimplemented ID"); } diff --git a/llvm/include/llvm/SandboxIR/SandboxIRValues.def 
b/llvm/include/llvm/SandboxIR/Values.def similarity index 98% rename from llvm/include/llvm/SandboxIR/SandboxIRValues.def rename to llvm/include/llvm/SandboxIR/Values.def index 2a9ca6d3d73ce..3d8ad6ce197f4 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIRValues.def +++ b/llvm/include/llvm/SandboxIR/Values.def @@ -1,4 +1,4 @@ -//===- SandboxIRValues.def --------------------------------------*- C++ -*-===// +//===- Values.def -----------------------------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/lib/SandboxIR/Instruction.cpp b/llvm/lib/SandboxIR/Instruction.cpp index b492af893794f..276c4f0872b10 100644 --- a/llvm/lib/SandboxIR/Instruction.cpp +++ b/llvm/lib/SandboxIR/Instruction.cpp @@ -18,7 +18,7 @@ const char *Instruction::getOpcodeName(Opcode Opc) { return #OPC; #define OPCODES(...) __VA_ARGS__ #define DEF_INSTR(ID, OPC, CLASS) OPC -#include "llvm/SandboxIR/SandboxIRValues.def" +#include "llvm/SandboxIR/Values.def" } llvm_unreachable("Unknown Opcode"); } @@ -173,7 +173,7 @@ bool Instruction::classof(const sandboxir::Value *From) { #define DEF_INSTR(ID, OPC, CLASS) \ case ClassID::ID: \ return true; -#include "llvm/SandboxIR/SandboxIRValues.def" +#include "llvm/SandboxIR/Values.def" default: return false; } diff --git a/llvm/lib/SandboxIR/User.cpp b/llvm/lib/SandboxIR/User.cpp index 148d75199439a..d7e4656e6e90e 100644 --- a/llvm/lib/SandboxIR/User.cpp +++ b/llvm/lib/SandboxIR/User.cpp @@ -82,7 +82,7 @@ bool User::classof(const Value *From) { #define DEF_INSTR(ID, OPC, CLASS) \ case ClassID::ID: \ return true; -#include "llvm/SandboxIR/SandboxIRValues.def" +#include "llvm/SandboxIR/Values.def" default: return false; } diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index 3bd520f3174c2..7206ee34d36e3 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ 
b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -6114,5 +6114,5 @@ define void @foo() { TEST_F(SandboxIRTest, CheckClassof) { #define DEF_INSTR(ID, OPC, CLASS) \ EXPECT_NE(&sandboxir::CLASS::classof, &sandboxir::Instruction::classof); -#include "llvm/SandboxIR/SandboxIRValues.def" +#include "llvm/SandboxIR/Values.def" } From 18fa9fa0439d483060cee42412926565838822d4 Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Mon, 30 Sep 2024 20:26:55 +0200 Subject: [PATCH 010/151] [LLD][COFF] Add support for ARM64EC delay-load imports (#110042) Fill the regular delay-load IAT with x86_64 delay-load thunks. Similarly to regular imports, create an auxiliary IAT and its copy for ARM64EC calls. These are filled with the same `__impchk_` thunks used for regular imports, which perform an indirect call with `__icall_helper_arm64ec` on the regular delay-load IAT. These auxiliary IATs are exposed via CHPE metadata starting from version 2. The MSVC linker creates one more copy of the auxiliary IAT. `__imp_func` symbols refer to that hidden IAT, while the `#func` thunk performs a call with the public auxiliary IAT. If the public auxiliary IAT is fine for `#func`, it should be fine for calls using the `__imp_func` symbol as well. Therefore, I made `__imp_func` refer to that IAT too. 
--- lld/COFF/DLL.cpp | 16 ++ lld/COFF/DLL.h | 4 + lld/COFF/Driver.cpp | 2 + lld/COFF/Writer.cpp | 23 +++ lld/test/COFF/Inputs/loadconfig-arm64ec.s | 4 +- lld/test/COFF/arm64ec-delayimport.test | 201 ++++++++++++++++++++++ 6 files changed, 248 insertions(+), 2 deletions(-) create mode 100644 lld/test/COFF/arm64ec-delayimport.test diff --git a/lld/COFF/DLL.cpp b/lld/COFF/DLL.cpp index 39dcce9fe8483..2d20b094888c7 100644 --- a/lld/COFF/DLL.cpp +++ b/lld/COFF/DLL.cpp @@ -812,6 +812,16 @@ void DelayLoadContents::create(Defined *h) { s->loadThunkSym = cast(ctx.symtab.addSynthetic(symName, t)); } + + if (s->file->impECSym) { + auto chunk = make(s->file); + auxIat.push_back(chunk); + s->file->impECSym->setLocation(chunk); + + chunk = make(s->file); + auxIatCopy.push_back(chunk); + s->file->auxImpCopySym->setLocation(chunk); + } } thunks.push_back(tm); if (pdataChunk) @@ -822,6 +832,10 @@ void DelayLoadContents::create(Defined *h) { // Terminate with null values. addresses.push_back(make(8)); names.push_back(make(8)); + if (ctx.config.machine == ARM64EC) { + auxIat.push_back(make(8)); + auxIatCopy.push_back(make(8)); + } for (int i = 0, e = syms.size(); i < e; ++i) syms[i]->setLocation(addresses[base + i]); @@ -845,6 +859,7 @@ void DelayLoadContents::create(Defined *h) { Chunk *DelayLoadContents::newTailMergeChunk(Chunk *dir) { switch (ctx.config.machine) { case AMD64: + case ARM64EC: return make(dir, helper); case I386: return make(ctx, dir, helper); @@ -880,6 +895,7 @@ Chunk *DelayLoadContents::newThunkChunk(DefinedImportData *s, Chunk *tailMerge) { switch (ctx.config.machine) { case AMD64: + case ARM64EC: return make(s, tailMerge); case I386: return make(ctx, s, tailMerge); diff --git a/lld/COFF/DLL.h b/lld/COFF/DLL.h index afb46f22ec9e1..f7d2b57a20a02 100644 --- a/lld/COFF/DLL.h +++ b/lld/COFF/DLL.h @@ -48,6 +48,8 @@ class DelayLoadContents { ArrayRef getCodeChunks() { return thunks; } ArrayRef getCodePData() { return pdata; } ArrayRef getCodeUnwindInfo() { return 
unwindinfo; } + ArrayRef getAuxIat() { return auxIat; } + ArrayRef getAuxIatCopy() { return auxIatCopy; } uint64_t getDirRVA() { return dirs[0]->getRVA(); } uint64_t getDirSize(); @@ -69,6 +71,8 @@ class DelayLoadContents { std::vector pdata; std::vector unwindinfo; std::vector dllNames; + std::vector auxIat; + std::vector auxIatCopy; COFFLinkerContext &ctx; }; diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index 5a6a4a61030e6..6a880b64c5858 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -2465,6 +2465,8 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { ctx.symtab.addAbsolute("__arm64x_extra_rfe_table_size", 0); ctx.symtab.addAbsolute("__arm64x_redirection_metadata", 0); ctx.symtab.addAbsolute("__arm64x_redirection_metadata_count", 0); + ctx.symtab.addAbsolute("__hybrid_auxiliary_delayload_iat_copy", 0); + ctx.symtab.addAbsolute("__hybrid_auxiliary_delayload_iat", 0); ctx.symtab.addAbsolute("__hybrid_auxiliary_iat", 0); ctx.symtab.addAbsolute("__hybrid_auxiliary_iat_copy", 0); ctx.symtab.addAbsolute("__hybrid_code_map", 0); diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index efab7d3e83709..71ee5ce468555 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -958,6 +958,13 @@ void Writer::appendECImportTables() { auxIat->chunks.end()); rdataSec->addContributingPartialSection(auxIat); } + + if (!delayIdata.getAuxIat().empty()) { + delayIdata.getAuxIat().front()->setAlignment(0x1000); + rdataSec->chunks.insert(rdataSec->chunks.end(), + delayIdata.getAuxIat().begin(), + delayIdata.getAuxIat().end()); + } } // Locate the first Chunk and size of the import directory list and the @@ -1294,6 +1301,8 @@ void Writer::appendImportThunks() { textSec->addChunk(c); for (Chunk *c : delayIdata.getCodePData()) pdataSec->addChunk(c); + for (Chunk *c : delayIdata.getAuxIatCopy()) + rdataSec->addChunk(c); for (Chunk *c : delayIdata.getCodeUnwindInfo()) rdataSec->addChunk(c); } @@ -2295,6 +2304,20 @@ void Writer::setECSymbols() { replaceSymbol( 
iatCopySym, "__hybrid_auxiliary_iat_copy", idata.auxIatCopy.empty() ? nullptr : idata.auxIatCopy.front()); + + Symbol *delayIatSym = + ctx.symtab.findUnderscore("__hybrid_auxiliary_delayload_iat"); + replaceSymbol( + delayIatSym, "__hybrid_auxiliary_delayload_iat", + delayIdata.getAuxIat().empty() ? nullptr + : delayIdata.getAuxIat().front()); + + Symbol *delayIatCopySym = + ctx.symtab.findUnderscore("__hybrid_auxiliary_delayload_iat_copy"); + replaceSymbol( + delayIatCopySym, "__hybrid_auxiliary_delayload_iat_copy", + delayIdata.getAuxIatCopy().empty() ? nullptr + : delayIdata.getAuxIatCopy().front()); } // Write section contents to a mmap'ed file. diff --git a/lld/test/COFF/Inputs/loadconfig-arm64ec.s b/lld/test/COFF/Inputs/loadconfig-arm64ec.s index 80ec893869e6f..26bcc66853f78 100644 --- a/lld/test/COFF/Inputs/loadconfig-arm64ec.s +++ b/lld/test/COFF/Inputs/loadconfig-arm64ec.s @@ -79,8 +79,8 @@ __chpe_metadata: .word __arm64x_extra_rfe_table_size .rva __os_arm64x_dispatch_fptr .rva __hybrid_auxiliary_iat_copy - .word 0 // __hybrid_auxiliary_delayload_iat - .word 0 // __hybrid_auxiliary_delayload_iat_copy + .rva __hybrid_auxiliary_delayload_iat + .rva __hybrid_auxiliary_delayload_iat_copy .word 0 // __hybrid_image_info_bitfield .rva __os_arm64x_helper3 .rva __os_arm64x_helper4 diff --git a/lld/test/COFF/arm64ec-delayimport.test b/lld/test/COFF/arm64ec-delayimport.test new file mode 100644 index 0000000000000..a0236d902eeab --- /dev/null +++ b/lld/test/COFF/arm64ec-delayimport.test @@ -0,0 +1,201 @@ +REQUIRES: aarch64, x86 +RUN: split-file %s %t.dir && cd %t.dir + +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows test.s -o test.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj +RUN: llvm-lib -machine:arm64ec -def:test.def -out:test-arm64ec.lib +RUN: llvm-lib -machine:arm64ec -def:test2.def -out:test2-arm64ec.lib + +RUN: lld-link -machine:arm64ec -dll -noentry -out:out.dll loadconfig-arm64ec.obj 
test.obj \ +RUN: test-arm64ec.lib test2-arm64ec.lib -delayload:test.dll -map + +RUN: llvm-readobj --hex-dump=.test out.dll | FileCheck --check-prefix=TESTSEC %s +TESTSEC: 0x180008000 00600000 88700000 00200000 10100000 +TESTSEC-NEXT: 0x180008010 08600000 90700000 10200000 30100000 +TESTSEC-NEXT: 0x180008020 1c100000 3c100000 00300000 + +RUN: llvm-objdump -d out.dll | FileCheck --check-prefix=DISASM %s +DISASM: 0000000180001000 <.text>: +DISASM-NEXT: 80001000: 52800000 mov w0, #0x0 // =0 +DISASM-NEXT: 180001004: d65f03c0 ret +DISASM-NEXT: 180001008: 52800020 mov w0, #0x1 // =1 +DISASM-NEXT: 18000100c: d65f03c0 ret +DISASM-NEXT: 180001010: b0000030 adrp x16, 0x180006000 +DISASM-NEXT: 180001014: f9400210 ldr x16, [x16] +DISASM-NEXT: 180001018: d61f0200 br x16 +DISASM-NEXT: 18000101c: d000002b adrp x11, 0x180007000 +DISASM-NEXT: 180001020: f940456b ldr x11, [x11, #0x88] +DISASM-NEXT: 180001024: 9000000a adrp x10, 0x180001000 <.text> +DISASM-NEXT: 180001028: 9101414a add x10, x10, #0x50 +DISASM-NEXT: 18000102c: 17fffff5 b 0x180001000 <.text> +DISASM-NEXT: 180001030: b0000030 adrp x16, 0x180006000 +DISASM-NEXT: 180001034: f9400610 ldr x16, [x16, #0x8] +DISASM-NEXT: 180001038: d61f0200 br x16 +DISASM-NEXT: 18000103c: d000002b adrp x11, 0x180007000 +DISASM-NEXT: 180001040: f940496b ldr x11, [x11, #0x90] +DISASM-NEXT: 180001044: 9000000a adrp x10, 0x180001000 <.text> +DISASM-NEXT: 180001048: 9101614a add x10, x10, #0x58 +DISASM-NEXT: 18000104c: 17ffffed b 0x180001000 <.text> +DISASM-NEXT: 180001050: 52800040 mov w0, #0x2 // =2 +DISASM-NEXT: 180001054: d65f03c0 ret +DISASM-NEXT: 180001058: 52800060 mov w0, #0x3 // =3 +DISASM-NEXT: 18000105c: d65f03c0 ret +DISASM-NEXT: ... +DISASM-NEXT: 180002000: ff 25 82 50 00 00 jmpq *0x5082(%rip) # 0x180007088 +DISASM-NEXT: ... 
+DISASM-NEXT: 18000200e: 00 00 addb %al, (%rax) +DISASM-NEXT: 180002010: ff 25 7a 50 00 00 jmpq *0x507a(%rip) # 0x180007090 +DISASM-NEXT: 180002016: 48 8d 05 6b 50 00 00 leaq 0x506b(%rip), %rax # 0x180007088 +DISASM-NEXT: 18000201d: e9 0c 00 00 00 jmp 0x18000202e <.text+0x102e> +DISASM-NEXT: 180002022: 48 8d 05 67 50 00 00 leaq 0x5067(%rip), %rax # 0x180007090 +DISASM-NEXT: 180002029: e9 00 00 00 00 jmp 0x18000202e <.text+0x102e> +DISASM-NEXT: 18000202e: 51 pushq %rcx +DISASM-NEXT: 18000202f: 52 pushq %rdx +DISASM-NEXT: 180002030: 41 50 pushq %r8 +DISASM-NEXT: 180002032: 41 51 pushq %r9 +DISASM-NEXT: 180002034: 48 83 ec 48 subq $0x48, %rsp +DISASM-NEXT: 180002038: 66 0f 7f 04 24 movdqa %xmm0, (%rsp) +DISASM-NEXT: 18000203d: 66 0f 7f 4c 24 10 movdqa %xmm1, 0x10(%rsp) +DISASM-NEXT: 180002043: 66 0f 7f 54 24 20 movdqa %xmm2, 0x20(%rsp) +DISASM-NEXT: 180002049: 66 0f 7f 5c 24 30 movdqa %xmm3, 0x30(%rsp) +DISASM-NEXT: 18000204f: 48 8b d0 movq %rax, %rdx +DISASM-NEXT: 180002052: 48 8d 0d 97 21 00 00 leaq 0x2197(%rip), %rcx # 0x1800041f0 +DISASM-NEXT: 180002059: e8 aa ef ff ff callq 0x180001008 <.text+0x8> +DISASM-NEXT: 18000205e: 66 0f 6f 04 24 movdqa (%rsp), %xmm0 +DISASM-NEXT: 180002063: 66 0f 6f 4c 24 10 movdqa 0x10(%rsp), %xmm1 +DISASM-NEXT: 180002069: 66 0f 6f 54 24 20 movdqa 0x20(%rsp), %xmm2 +DISASM-NEXT: 18000206f: 66 0f 6f 5c 24 30 movdqa 0x30(%rsp), %xmm3 +DISASM-NEXT: 180002075: 48 83 c4 48 addq $0x48, %rsp +DISASM-NEXT: 180002079: 41 59 popq %r9 +DISASM-NEXT: 18000207b: 41 58 popq %r8 +DISASM-NEXT: 18000207d: 5a popq %rdx +DISASM-NEXT: 18000207e: 59 popq %rcx +DISASM-NEXT: 18000207f: ff e0 jmpq *%rax + +RUN: llvm-readobj --coff-load-config out.dll | FileCheck --check-prefix=LOADCFG %s +LOADCFG: CHPEMetadata [ +LOADCFG: AuxiliaryDelayloadIAT: 0x6000 +LOADCFG-NEXT: AuxiliaryDelayloadIATCopy: 0x4000 + +RUN: llvm-readobj --coff-imports out.dll | FileCheck --check-prefix=IMPORTS %s +IMPORTS: DelayImport { +IMPORTS-NEXT: Name: test.dll +IMPORTS-NEXT: Attributes: 
0x1 +IMPORTS-NEXT: ModuleHandle: 0x7080 +IMPORTS-NEXT: ImportAddressTable: 0x7088 +IMPORTS-NEXT: ImportNameTable: 0x4230 +IMPORTS-NEXT: BoundDelayImportTable: 0x0 +IMPORTS-NEXT: UnloadDelayImportTable: 0x0 +IMPORTS-NEXT: Import { +IMPORTS-NEXT: Symbol: func (0) +IMPORTS-NEXT: Address: 0x180002016 +IMPORTS-NEXT: } +IMPORTS-NEXT: Import { +IMPORTS-NEXT: Symbol: func2 (0) +IMPORTS-NEXT: Address: 0x180002022 +IMPORTS-NEXT: } +IMPORTS-NEXT: } + +RUN: FileCheck --check-prefix=MAP %s < out.map +MAP: 0001:00000008 #__delayLoadHelper2 0000000180001008 test.obj +MAP: 0001:00000010 #func 0000000180001010 test-arm64ec:test.dll +MAP-NEXT: 0001:0000001c __impchk_func 000000018000101c test-arm64ec:test.dll +MAP-NEXT: 0001:00000030 #func2 0000000180001030 test-arm64ec:test.dll +MAP-NEXT: 0001:0000003c __impchk_func2 000000018000103c test-arm64ec:test.dll +MAP-NEXT: 0001:00000050 func_exit_thunk 0000000180001050 test.obj +MAP-NEXT: 0001:00000058 func2_exit_thunk 0000000180001058 test.obj +MAP-NEXT: 0001:00001000 func 0000000180002000 test-arm64ec:test.dll +MAP-NEXT: 0001:00001010 func2 0000000180002010 test-arm64ec:test.dll +MAP-NEXT: 0002:00000000 __imp_data 0000000180003000 test2-arm64ec:test2.dll +MAP-NEXT: 0000:00000000 __hybrid_auxiliary_delayload_iat_copy 0000000180004000 +MAP-NEXT: 0002:00001000 __auximpcopy_func 0000000180004000 test-arm64ec:test.dll +MAP-NEXT: 0002:00001008 __auximpcopy_func2 0000000180004008 test-arm64ec:test.dll +MAP: 0002:00003000 __imp_func 0000000180006000 test-arm64ec:test.dll +MAP-NEXT: 0002:00003008 __imp_func2 0000000180006008 test-arm64ec:test.dll +MAP: 0003:00000088 __imp_aux_func 0000000180007088 test-arm64ec:test.dll +MAP-NEXT: 0003:00000090 __imp_aux_func2 0000000180007090 test-arm64ec:test.dll + +RUN: llvm-readobj --hex-dump=.rdata out.dll | FileCheck --check-prefix=RDATA %s +RDATA: 0x180004000 1c100080 01000000 3c100080 01000000 +RDATA-NEXT: 0x180004010 00000000 00000000 +RDATA: 0x180006000 1c100080 01000000 3c100080 01000000 +RDATA-NEXT: 
0x180006010 00000000 00000000 + +RUN: llvm-readobj --coff-basereloc out.dll | FileCheck --check-prefix=RELOC %s +RELOC: BaseReloc [ +RELOC-NEXT: Entry { +RELOC-NEXT: Type: DIR64 +RELOC-NEXT: Address: 0x4000 +RELOC-NEXT: } +RELOC-NEXT: Entry { +RELOC-NEXT: Type: DIR64 +RELOC-NEXT: Address: 0x4008 +RELOC-NEXT: } +RELOC: Address: 0x6000 +RELOC-NEXT: } +RELOC-NEXT: Entry { +RELOC-NEXT: Type: DIR64 +RELOC-NEXT: Address: 0x6008 +RELOC-NEXT: } + +#--- test.s + .section .test,"r" + .rva __imp_func + .rva __imp_aux_func + .rva func + .rva "#func" + .rva __imp_func2 + .rva __imp_aux_func2 + .rva func2 + .rva "#func2" + .rva __impchk_func + .rva __impchk_func2 + .rva __imp_data + + .section .text,"xr",discard,__icall_helper_arm64ec + .globl __icall_helper_arm64ec + .p2align 2, 0x0 +__icall_helper_arm64ec: + mov w0, #0 + ret + + .section .text,"xr",discard,"#__delayLoadHelper2" + .globl "#__delayLoadHelper2" + .p2align 2, 0x0 +"#__delayLoadHelper2": + mov w0, #1 + ret + + .weak_anti_dep __delayLoadHelper2 +.set __delayLoadHelper2,"#__delayLoadHelper2" + + .section .hybmp$x, "yi" + .symidx __imp_func + .symidx func_exit_thunk + .word 4 + .symidx __imp_func2 + .symidx func2_exit_thunk + .word 4 + + .section .wowthk$aa,"xr",discard,func_exit_thunk + .globl func_exit_thunk +func_exit_thunk: + mov w0, #2 + ret + + .section .wowthk$aa,"xr",discard,func2_exit_thunk + .globl func2_exit_thunk +func2_exit_thunk: + mov w0, #3 + ret + +#--- test.def +NAME test.dll +EXPORTS + func + func2 + +#--- test2.def +NAME test2.dll +EXPORTS + data DATA From ab393cee9dffdb225b94badcb9c21f80b156b74b Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 30 Sep 2024 11:44:23 -0700 Subject: [PATCH 011/151] [RISCV] Take known minimum vlen into account when calculating alignment padding in assignRVVStackObjectOffsets. (#110312) If we know vlen is a multiple of 16, we don't need any alignment padding. 
I wrote the code so that it would generate the minimum amount of padding if the stack align was 32 or larger or if RVVBitsPerBlock was smaller than half the stack alignment. --- llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 11 +- .../early-clobber-tied-def-subreg-liveness.ll | 10 +- llvm/test/CodeGen/RISCV/rvv-cfi-info.ll | 53 ++-- .../RISCV/rvv/access-fixed-objects-by-rvv.ll | 4 +- .../RISCV/rvv/addi-scalable-offset.mir | 2 - .../rvv/alloca-load-store-scalable-array.ll | 8 +- .../CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll | 255 ++++++++++++------ .../CodeGen/RISCV/rvv/callee-saved-regs.ll | 2 - .../test/CodeGen/RISCV/rvv/emergency-slot.mir | 2 +- .../rvv/fixed-vectors-fp-buildvec-bf16.ll | 8 +- .../RISCV/rvv/fixed-vectors-fp-buildvec.ll | 8 +- .../rvv/fixed-vectors-interleaved-access.ll | 10 +- .../CodeGen/RISCV/rvv/fixed-vectors-llrint.ll | 16 +- .../RISCV/rvv/fixed-vectors-vfwmaccbf16.ll | 16 +- .../RISCV/rvv/fixed-vectors-vpscatter.ll | 20 +- llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll | 26 +- llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll | 26 +- .../CodeGen/RISCV/rvv/fpclamptosat_vec.ll | 96 +++---- llvm/test/CodeGen/RISCV/rvv/frm-insert.ll | 20 -- .../CodeGen/RISCV/rvv/no-reserved-frame.ll | 1 - .../CodeGen/RISCV/rvv/rv32-spill-vector.ll | 8 - .../CodeGen/RISCV/rvv/rv32-spill-zvlsseg.ll | 4 - .../CodeGen/RISCV/rvv/rv64-spill-vector.ll | 4 - .../CodeGen/RISCV/rvv/rv64-spill-zvlsseg.ll | 4 - .../CodeGen/RISCV/rvv/scalar-stack-align.ll | 107 +++++--- llvm/test/CodeGen/RISCV/rvv/stack-folding.ll | 32 +-- llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll | 28 +- llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll | 28 +- llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll | 70 ++--- llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll | 14 +- llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll | 28 +- .../CodeGen/RISCV/rvv/vpscatter-sdnode.ll | 10 +- llvm/test/CodeGen/RISCV/rvv/vxrm-insert.ll | 4 - .../rvv/wrong-stack-offset-for-rvv-object.mir | 29 +- 34 files changed, 473 insertions(+), 491 deletions(-) diff --git 
a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 22824b77c37dd..b0c525ea8c299 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -1102,16 +1102,25 @@ RISCVFrameLowering::assignRVVStackObjectOffsets(MachineFunction &MF) const { RVVStackAlign = std::max(RVVStackAlign, ObjectAlign); } + uint64_t StackSize = Offset; + + // Multiply by vscale. + if (ST.getRealMinVLen() >= RISCV::RVVBitsPerBlock) + StackSize *= ST.getRealMinVLen() / RISCV::RVVBitsPerBlock; + // Ensure the alignment of the RVV stack. Since we want the most-aligned // object right at the bottom (i.e., any padding at the top of the frame), // readjust all RVV objects down by the alignment padding. - uint64_t StackSize = Offset; if (auto AlignmentPadding = offsetToAlignment(StackSize, RVVStackAlign)) { StackSize += AlignmentPadding; for (int FI : ObjectsToAllocate) MFI.setObjectOffset(FI, MFI.getObjectOffset(FI) - AlignmentPadding); } + // Remove vscale. 
+ if (ST.getRealMinVLen() >= RISCV::RVVBitsPerBlock) + StackSize /= ST.getRealMinVLen() / RISCV::RVVBitsPerBlock; + return std::make_pair(StackSize, RVVStackAlign); } diff --git a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll index 899aad6ed7232..0c2b809c0be20 100644 --- a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll +++ b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll @@ -17,10 +17,10 @@ define void @_Z3foov() { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 10 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a1, a0, 3 +; CHECK-NEXT: add a0, a1, a0 ; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x0a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 10 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x09, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 9 * vlenb ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_49) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_49) ; CHECK-NEXT: vsetivli zero, 2, e16, m2, ta, ma @@ -83,8 +83,8 @@ define void @_Z3foov() { ; CHECK-NEXT: addi a0, a0, %lo(var_47) ; CHECK-NEXT: vsseg4e16.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 10 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a1, a0, 3 +; CHECK-NEXT: add a0, a1, a0 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv-cfi-info.ll b/llvm/test/CodeGen/RISCV/rvv-cfi-info.ll index 93fe66695b70e..225680e846bac 100644 --- a/llvm/test/CodeGen/RISCV/rvv-cfi-info.ll +++ b/llvm/test/CodeGen/RISCV/rvv-cfi-info.ll @@ -10,9 +10,10 @@ define riscv_vector_cc @test_vector_callee_cfi( @test_vector_callee_cfi( @test_vector_callee_cfi( @test_vector_callee_cfi( @access_fixed_and_vector_objects(ptr %val) { ; RV64IV-NEXT: addi sp, 
sp, -528 ; RV64IV-NEXT: .cfi_def_cfa_offset 528 ; RV64IV-NEXT: csrr a0, vlenb -; RV64IV-NEXT: slli a0, a0, 1 ; RV64IV-NEXT: sub sp, sp, a0 -; RV64IV-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x90, 0x04, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 528 + 2 * vlenb +; RV64IV-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x90, 0x04, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 528 + 1 * vlenb ; RV64IV-NEXT: addi a0, sp, 8 ; RV64IV-NEXT: vl1re64.v v8, (a0) ; RV64IV-NEXT: addi a0, sp, 528 @@ -44,7 +43,6 @@ define @access_fixed_and_vector_objects(ptr %val) { ; RV64IV-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV64IV-NEXT: vadd.vv v8, v8, v9 ; RV64IV-NEXT: csrr a0, vlenb -; RV64IV-NEXT: slli a0, a0, 1 ; RV64IV-NEXT: add sp, sp, a0 ; RV64IV-NEXT: addi sp, sp, 528 ; RV64IV-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/addi-scalable-offset.mir b/llvm/test/CodeGen/RISCV/rvv/addi-scalable-offset.mir index f976adcfe931c..5f0e1a9b9aa24 100644 --- a/llvm/test/CodeGen/RISCV/rvv/addi-scalable-offset.mir +++ b/llvm/test/CodeGen/RISCV/rvv/addi-scalable-offset.mir @@ -38,12 +38,10 @@ body: | ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $x8, 0 ; CHECK-NEXT: $x2 = frame-setup ADDI $x2, -240 ; CHECK-NEXT: $x12 = frame-setup PseudoReadVLENB - ; CHECK-NEXT: $x12 = frame-setup SLLI killed $x12, 1 ; CHECK-NEXT: $x2 = frame-setup SUB $x2, killed $x12 ; CHECK-NEXT: dead $x0 = PseudoVSETVLI killed renamable $x11, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: renamable $v8 = PseudoVLE64_V_M1 undef renamable $v8, killed renamable $x10, $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype :: (load unknown-size from %ir.pa, align 8) ; CHECK-NEXT: $x10 = PseudoReadVLENB - ; CHECK-NEXT: $x10 = SLLI killed $x10, 1 ; CHECK-NEXT: $x10 = SUB $x8, killed $x10 ; CHECK-NEXT: $x10 = ADDI killed $x10, -2048 ; CHECK-NEXT: $x10 = ADDI killed $x10, -224 diff --git 
a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll index 1fe91c721f4dd..2e70c3395090e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll +++ b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll @@ -11,9 +11,10 @@ define void @test(ptr %addr) { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrrs a1, vlenb, zero -; CHECK-NEXT: slli a1, a1, 2 +; CHECK-NEXT: slli a2, a1, 1 +; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 3 * vlenb ; CHECK-NEXT: csrrs a1, vlenb, zero ; CHECK-NEXT: add a2, a0, a1 ; CHECK-NEXT: vl1re64.v v8, (a2) @@ -28,7 +29,8 @@ define void @test(ptr %addr) { ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vs1r.v v8, (a0) ; CHECK-NEXT: csrrs a0, vlenb, zero -; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: slli a1, a0, 1 +; CHECK-NEXT: add a0, a1, a0 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: jalr zero, 0(ra) diff --git a/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll b/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll index 90794820ddd84..35e269b911902 100644 --- a/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll +++ b/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll @@ -7,34 +7,13 @@ ; RUN: | FileCheck %s --check-prefixes=CHECK,NOMUL define void @lmul1() nounwind { -; NOZBA-LABEL: lmul1: -; NOZBA: # %bb.0: -; NOZBA-NEXT: csrr a0, vlenb -; NOZBA-NEXT: slli a0, a0, 1 -; NOZBA-NEXT: sub sp, sp, a0 -; NOZBA-NEXT: csrr a0, vlenb -; NOZBA-NEXT: slli a0, a0, 1 -; NOZBA-NEXT: add sp, sp, a0 -; NOZBA-NEXT: ret -; -; ZBA-LABEL: lmul1: -; ZBA: # %bb.0: -; ZBA-NEXT: csrr a0, vlenb -; ZBA-NEXT: slli a0, a0, 1 -; 
ZBA-NEXT: sub sp, sp, a0 -; ZBA-NEXT: csrr a0, vlenb -; ZBA-NEXT: sh1add sp, a0, sp -; ZBA-NEXT: ret -; -; NOMUL-LABEL: lmul1: -; NOMUL: # %bb.0: -; NOMUL-NEXT: csrr a0, vlenb -; NOMUL-NEXT: slli a0, a0, 1 -; NOMUL-NEXT: sub sp, sp, a0 -; NOMUL-NEXT: csrr a0, vlenb -; NOMUL-NEXT: slli a0, a0, 1 -; NOMUL-NEXT: add sp, sp, a0 -; NOMUL-NEXT: ret +; CHECK-LABEL: lmul1: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: ret %v = alloca ret void } @@ -192,29 +171,34 @@ define void @lmul2_and_1() nounwind { ; NOZBA-LABEL: lmul2_and_1: ; NOZBA: # %bb.0: ; NOZBA-NEXT: csrr a0, vlenb -; NOZBA-NEXT: slli a0, a0, 2 +; NOZBA-NEXT: slli a1, a0, 1 +; NOZBA-NEXT: add a0, a1, a0 ; NOZBA-NEXT: sub sp, sp, a0 ; NOZBA-NEXT: csrr a0, vlenb -; NOZBA-NEXT: slli a0, a0, 2 +; NOZBA-NEXT: slli a1, a0, 1 +; NOZBA-NEXT: add a0, a1, a0 ; NOZBA-NEXT: add sp, sp, a0 ; NOZBA-NEXT: ret ; ; ZBA-LABEL: lmul2_and_1: ; ZBA: # %bb.0: ; ZBA-NEXT: csrr a0, vlenb -; ZBA-NEXT: slli a0, a0, 2 +; ZBA-NEXT: sh1add a0, a0, a0 ; ZBA-NEXT: sub sp, sp, a0 ; ZBA-NEXT: csrr a0, vlenb -; ZBA-NEXT: sh2add sp, a0, sp +; ZBA-NEXT: sh1add a0, a0, a0 +; ZBA-NEXT: add sp, sp, a0 ; ZBA-NEXT: ret ; ; NOMUL-LABEL: lmul2_and_1: ; NOMUL: # %bb.0: ; NOMUL-NEXT: csrr a0, vlenb -; NOMUL-NEXT: slli a0, a0, 2 +; NOMUL-NEXT: slli a1, a0, 1 +; NOMUL-NEXT: add a0, a1, a0 ; NOMUL-NEXT: sub sp, sp, a0 ; NOMUL-NEXT: csrr a0, vlenb -; NOMUL-NEXT: slli a0, a0, 2 +; NOMUL-NEXT: slli a1, a0, 1 +; NOMUL-NEXT: add a0, a1, a0 ; NOMUL-NEXT: add sp, sp, a0 ; NOMUL-NEXT: ret %v1 = alloca @@ -223,63 +207,176 @@ define void @lmul2_and_1() nounwind { } define void @lmul4_and_1() nounwind { -; CHECK-LABEL: lmul4_and_1: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -48 -; CHECK-NEXT: sd ra, 40(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 32(sp) # 8-byte Folded Spill -; CHECK-NEXT: addi s0, sp, 48 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli 
a0, a0, 3 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: andi sp, sp, -32 -; CHECK-NEXT: addi sp, s0, -48 -; CHECK-NEXT: ld ra, 40(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 32(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 48 -; CHECK-NEXT: ret +; NOZBA-LABEL: lmul4_and_1: +; NOZBA: # %bb.0: +; NOZBA-NEXT: addi sp, sp, -48 +; NOZBA-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; NOZBA-NEXT: sd s0, 32(sp) # 8-byte Folded Spill +; NOZBA-NEXT: addi s0, sp, 48 +; NOZBA-NEXT: csrr a0, vlenb +; NOZBA-NEXT: li a1, 6 +; NOZBA-NEXT: mul a0, a0, a1 +; NOZBA-NEXT: sub sp, sp, a0 +; NOZBA-NEXT: andi sp, sp, -32 +; NOZBA-NEXT: addi sp, s0, -48 +; NOZBA-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; NOZBA-NEXT: ld s0, 32(sp) # 8-byte Folded Reload +; NOZBA-NEXT: addi sp, sp, 48 +; NOZBA-NEXT: ret +; +; ZBA-LABEL: lmul4_and_1: +; ZBA: # %bb.0: +; ZBA-NEXT: addi sp, sp, -48 +; ZBA-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; ZBA-NEXT: sd s0, 32(sp) # 8-byte Folded Spill +; ZBA-NEXT: addi s0, sp, 48 +; ZBA-NEXT: csrr a0, vlenb +; ZBA-NEXT: slli a0, a0, 1 +; ZBA-NEXT: sh1add a0, a0, a0 +; ZBA-NEXT: sub sp, sp, a0 +; ZBA-NEXT: andi sp, sp, -32 +; ZBA-NEXT: addi sp, s0, -48 +; ZBA-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; ZBA-NEXT: ld s0, 32(sp) # 8-byte Folded Reload +; ZBA-NEXT: addi sp, sp, 48 +; ZBA-NEXT: ret +; +; NOMUL-LABEL: lmul4_and_1: +; NOMUL: # %bb.0: +; NOMUL-NEXT: addi sp, sp, -48 +; NOMUL-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; NOMUL-NEXT: sd s0, 32(sp) # 8-byte Folded Spill +; NOMUL-NEXT: addi s0, sp, 48 +; NOMUL-NEXT: csrr a0, vlenb +; NOMUL-NEXT: slli a0, a0, 1 +; NOMUL-NEXT: mv a1, a0 +; NOMUL-NEXT: slli a0, a0, 1 +; NOMUL-NEXT: add a0, a0, a1 +; NOMUL-NEXT: sub sp, sp, a0 +; NOMUL-NEXT: andi sp, sp, -32 +; NOMUL-NEXT: addi sp, s0, -48 +; NOMUL-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; NOMUL-NEXT: ld s0, 32(sp) # 8-byte Folded Reload +; NOMUL-NEXT: addi sp, sp, 48 +; NOMUL-NEXT: ret %v1 = alloca %v2 = alloca ret void } define void 
@lmul4_and_2() nounwind { -; CHECK-LABEL: lmul4_and_2: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -48 -; CHECK-NEXT: sd ra, 40(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 32(sp) # 8-byte Folded Spill -; CHECK-NEXT: addi s0, sp, 48 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: andi sp, sp, -32 -; CHECK-NEXT: addi sp, s0, -48 -; CHECK-NEXT: ld ra, 40(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 32(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 48 -; CHECK-NEXT: ret +; NOZBA-LABEL: lmul4_and_2: +; NOZBA: # %bb.0: +; NOZBA-NEXT: addi sp, sp, -48 +; NOZBA-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; NOZBA-NEXT: sd s0, 32(sp) # 8-byte Folded Spill +; NOZBA-NEXT: addi s0, sp, 48 +; NOZBA-NEXT: csrr a0, vlenb +; NOZBA-NEXT: li a1, 6 +; NOZBA-NEXT: mul a0, a0, a1 +; NOZBA-NEXT: sub sp, sp, a0 +; NOZBA-NEXT: andi sp, sp, -32 +; NOZBA-NEXT: addi sp, s0, -48 +; NOZBA-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; NOZBA-NEXT: ld s0, 32(sp) # 8-byte Folded Reload +; NOZBA-NEXT: addi sp, sp, 48 +; NOZBA-NEXT: ret +; +; ZBA-LABEL: lmul4_and_2: +; ZBA: # %bb.0: +; ZBA-NEXT: addi sp, sp, -48 +; ZBA-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; ZBA-NEXT: sd s0, 32(sp) # 8-byte Folded Spill +; ZBA-NEXT: addi s0, sp, 48 +; ZBA-NEXT: csrr a0, vlenb +; ZBA-NEXT: slli a0, a0, 1 +; ZBA-NEXT: sh1add a0, a0, a0 +; ZBA-NEXT: sub sp, sp, a0 +; ZBA-NEXT: andi sp, sp, -32 +; ZBA-NEXT: addi sp, s0, -48 +; ZBA-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; ZBA-NEXT: ld s0, 32(sp) # 8-byte Folded Reload +; ZBA-NEXT: addi sp, sp, 48 +; ZBA-NEXT: ret +; +; NOMUL-LABEL: lmul4_and_2: +; NOMUL: # %bb.0: +; NOMUL-NEXT: addi sp, sp, -48 +; NOMUL-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; NOMUL-NEXT: sd s0, 32(sp) # 8-byte Folded Spill +; NOMUL-NEXT: addi s0, sp, 48 +; NOMUL-NEXT: csrr a0, vlenb +; NOMUL-NEXT: slli a0, a0, 1 +; NOMUL-NEXT: mv a1, a0 +; NOMUL-NEXT: slli a0, a0, 1 +; NOMUL-NEXT: add a0, a0, a1 +; NOMUL-NEXT: sub 
sp, sp, a0 +; NOMUL-NEXT: andi sp, sp, -32 +; NOMUL-NEXT: addi sp, s0, -48 +; NOMUL-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; NOMUL-NEXT: ld s0, 32(sp) # 8-byte Folded Reload +; NOMUL-NEXT: addi sp, sp, 48 +; NOMUL-NEXT: ret %v1 = alloca %v2 = alloca ret void } define void @lmul4_and_2_x2_0() nounwind { -; CHECK-LABEL: lmul4_and_2_x2_0: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -48 -; CHECK-NEXT: sd ra, 40(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 32(sp) # 8-byte Folded Spill -; CHECK-NEXT: addi s0, sp, 48 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: andi sp, sp, -32 -; CHECK-NEXT: addi sp, s0, -48 -; CHECK-NEXT: ld ra, 40(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 32(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 48 -; CHECK-NEXT: ret +; NOZBA-LABEL: lmul4_and_2_x2_0: +; NOZBA: # %bb.0: +; NOZBA-NEXT: addi sp, sp, -48 +; NOZBA-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; NOZBA-NEXT: sd s0, 32(sp) # 8-byte Folded Spill +; NOZBA-NEXT: addi s0, sp, 48 +; NOZBA-NEXT: csrr a0, vlenb +; NOZBA-NEXT: li a1, 14 +; NOZBA-NEXT: mul a0, a0, a1 +; NOZBA-NEXT: sub sp, sp, a0 +; NOZBA-NEXT: andi sp, sp, -32 +; NOZBA-NEXT: addi sp, s0, -48 +; NOZBA-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; NOZBA-NEXT: ld s0, 32(sp) # 8-byte Folded Reload +; NOZBA-NEXT: addi sp, sp, 48 +; NOZBA-NEXT: ret +; +; ZBA-LABEL: lmul4_and_2_x2_0: +; ZBA: # %bb.0: +; ZBA-NEXT: addi sp, sp, -48 +; ZBA-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; ZBA-NEXT: sd s0, 32(sp) # 8-byte Folded Spill +; ZBA-NEXT: addi s0, sp, 48 +; ZBA-NEXT: csrr a0, vlenb +; ZBA-NEXT: li a1, 14 +; ZBA-NEXT: mul a0, a0, a1 +; ZBA-NEXT: sub sp, sp, a0 +; ZBA-NEXT: andi sp, sp, -32 +; ZBA-NEXT: addi sp, s0, -48 +; ZBA-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; ZBA-NEXT: ld s0, 32(sp) # 8-byte Folded Reload +; ZBA-NEXT: addi sp, sp, 48 +; ZBA-NEXT: ret +; +; NOMUL-LABEL: lmul4_and_2_x2_0: +; NOMUL: # %bb.0: +; NOMUL-NEXT: addi sp, sp, -48 +; 
NOMUL-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; NOMUL-NEXT: sd s0, 32(sp) # 8-byte Folded Spill +; NOMUL-NEXT: addi s0, sp, 48 +; NOMUL-NEXT: csrr a0, vlenb +; NOMUL-NEXT: slli a0, a0, 1 +; NOMUL-NEXT: mv a1, a0 +; NOMUL-NEXT: slli a0, a0, 1 +; NOMUL-NEXT: add a1, a1, a0 +; NOMUL-NEXT: slli a0, a0, 1 +; NOMUL-NEXT: add a0, a0, a1 +; NOMUL-NEXT: sub sp, sp, a0 +; NOMUL-NEXT: andi sp, sp, -32 +; NOMUL-NEXT: addi sp, s0, -48 +; NOMUL-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; NOMUL-NEXT: ld s0, 32(sp) # 8-byte Folded Reload +; NOMUL-NEXT: addi sp, sp, 48 +; NOMUL-NEXT: ret %v1 = alloca %v2 = alloca %v3 = alloca diff --git a/llvm/test/CodeGen/RISCV/rvv/callee-saved-regs.ll b/llvm/test/CodeGen/RISCV/rvv/callee-saved-regs.ll index 2177bbfe5b2a4..c1ce2e988fc51 100644 --- a/llvm/test/CodeGen/RISCV/rvv/callee-saved-regs.ll +++ b/llvm/test/CodeGen/RISCV/rvv/callee-saved-regs.ll @@ -7,7 +7,6 @@ define @test_vector_std( %va) nounwind { ; SPILL-O2: # %bb.0: # %entry ; SPILL-O2-NEXT: addi sp, sp, -16 ; SPILL-O2-NEXT: csrr a0, vlenb -; SPILL-O2-NEXT: slli a0, a0, 1 ; SPILL-O2-NEXT: sub sp, sp, a0 ; SPILL-O2-NEXT: addi a0, sp, 16 ; SPILL-O2-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill @@ -15,7 +14,6 @@ define @test_vector_std( %va) nounwind { ; SPILL-O2-NEXT: #NO_APP ; SPILL-O2-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload ; SPILL-O2-NEXT: csrr a0, vlenb -; SPILL-O2-NEXT: slli a0, a0, 1 ; SPILL-O2-NEXT: add sp, sp, a0 ; SPILL-O2-NEXT: addi sp, sp, 16 ; SPILL-O2-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/emergency-slot.mir b/llvm/test/CodeGen/RISCV/rvv/emergency-slot.mir index 600084632ce68..c4bc794b8aeb3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/emergency-slot.mir +++ b/llvm/test/CodeGen/RISCV/rvv/emergency-slot.mir @@ -83,7 +83,7 @@ body: | ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $x8, 0 ; CHECK-NEXT: $x2 = frame-setup ADDI $x2, -272 ; CHECK-NEXT: $x10 = frame-setup PseudoReadVLENB - ; CHECK-NEXT: $x11 = frame-setup ADDI $x0, 52 + ; CHECK-NEXT: 
$x11 = frame-setup ADDI $x0, 51 ; CHECK-NEXT: $x10 = frame-setup MUL killed $x10, killed $x11 ; CHECK-NEXT: $x2 = frame-setup SUB $x2, killed $x10 ; CHECK-NEXT: $x2 = frame-setup ANDI $x2, -128 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec-bf16.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec-bf16.ll index 727e03125176a..bdedc5f33c3a1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec-bf16.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec-bf16.ll @@ -26,9 +26,8 @@ define <4 x bfloat> @splat_idx_v4bf16(<4 x bfloat> %v, i64 %idx) { ; RV32-ZFBFMIN-NEXT: sw ra, 44(sp) # 4-byte Folded Spill ; RV32-ZFBFMIN-NEXT: .cfi_offset ra, -4 ; RV32-ZFBFMIN-NEXT: csrr a1, vlenb -; RV32-ZFBFMIN-NEXT: slli a1, a1, 1 ; RV32-ZFBFMIN-NEXT: sub sp, sp, a1 -; RV32-ZFBFMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 2 * vlenb +; RV32-ZFBFMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 1 * vlenb ; RV32-ZFBFMIN-NEXT: addi a1, sp, 32 ; RV32-ZFBFMIN-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill ; RV32-ZFBFMIN-NEXT: andi a0, a0, 3 @@ -43,7 +42,6 @@ define <4 x bfloat> @splat_idx_v4bf16(<4 x bfloat> %v, i64 %idx) { ; RV32-ZFBFMIN-NEXT: lh a0, 0(a0) ; RV32-ZFBFMIN-NEXT: vmv.v.x v8, a0 ; RV32-ZFBFMIN-NEXT: csrr a0, vlenb -; RV32-ZFBFMIN-NEXT: slli a0, a0, 1 ; RV32-ZFBFMIN-NEXT: add sp, sp, a0 ; RV32-ZFBFMIN-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32-ZFBFMIN-NEXT: addi sp, sp, 48 @@ -56,9 +54,8 @@ define <4 x bfloat> @splat_idx_v4bf16(<4 x bfloat> %v, i64 %idx) { ; RV64-ZFBFMIN-NEXT: sd ra, 40(sp) # 8-byte Folded Spill ; RV64-ZFBFMIN-NEXT: .cfi_offset ra, -8 ; RV64-ZFBFMIN-NEXT: csrr a1, vlenb -; RV64-ZFBFMIN-NEXT: slli a1, a1, 1 ; RV64-ZFBFMIN-NEXT: sub sp, sp, a1 -; RV64-ZFBFMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 
0x22 # sp + 48 + 2 * vlenb +; RV64-ZFBFMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 1 * vlenb ; RV64-ZFBFMIN-NEXT: addi a1, sp, 32 ; RV64-ZFBFMIN-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill ; RV64-ZFBFMIN-NEXT: andi a0, a0, 3 @@ -73,7 +70,6 @@ define <4 x bfloat> @splat_idx_v4bf16(<4 x bfloat> %v, i64 %idx) { ; RV64-ZFBFMIN-NEXT: lh a0, 0(a0) ; RV64-ZFBFMIN-NEXT: vmv.v.x v8, a0 ; RV64-ZFBFMIN-NEXT: csrr a0, vlenb -; RV64-ZFBFMIN-NEXT: slli a0, a0, 1 ; RV64-ZFBFMIN-NEXT: add sp, sp, a0 ; RV64-ZFBFMIN-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64-ZFBFMIN-NEXT: addi sp, sp, 48 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll index bf2eb3ff0261a..53059a4f28d42 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll @@ -206,9 +206,8 @@ define <4 x half> @splat_idx_v4f16(<4 x half> %v, i64 %idx) { ; RV32-ZFHMIN-NEXT: sw ra, 44(sp) # 4-byte Folded Spill ; RV32-ZFHMIN-NEXT: .cfi_offset ra, -4 ; RV32-ZFHMIN-NEXT: csrr a1, vlenb -; RV32-ZFHMIN-NEXT: slli a1, a1, 1 ; RV32-ZFHMIN-NEXT: sub sp, sp, a1 -; RV32-ZFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 2 * vlenb +; RV32-ZFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 1 * vlenb ; RV32-ZFHMIN-NEXT: addi a1, sp, 32 ; RV32-ZFHMIN-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill ; RV32-ZFHMIN-NEXT: andi a0, a0, 3 @@ -223,7 +222,6 @@ define <4 x half> @splat_idx_v4f16(<4 x half> %v, i64 %idx) { ; RV32-ZFHMIN-NEXT: lh a0, 0(a0) ; RV32-ZFHMIN-NEXT: vmv.v.x v8, a0 ; RV32-ZFHMIN-NEXT: csrr a0, vlenb -; RV32-ZFHMIN-NEXT: slli a0, a0, 1 ; RV32-ZFHMIN-NEXT: add sp, sp, a0 ; RV32-ZFHMIN-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; 
RV32-ZFHMIN-NEXT: addi sp, sp, 48 @@ -236,9 +234,8 @@ define <4 x half> @splat_idx_v4f16(<4 x half> %v, i64 %idx) { ; RV64-ZFHMIN-NEXT: sd ra, 40(sp) # 8-byte Folded Spill ; RV64-ZFHMIN-NEXT: .cfi_offset ra, -8 ; RV64-ZFHMIN-NEXT: csrr a1, vlenb -; RV64-ZFHMIN-NEXT: slli a1, a1, 1 ; RV64-ZFHMIN-NEXT: sub sp, sp, a1 -; RV64-ZFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 2 * vlenb +; RV64-ZFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 1 * vlenb ; RV64-ZFHMIN-NEXT: addi a1, sp, 32 ; RV64-ZFHMIN-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill ; RV64-ZFHMIN-NEXT: andi a0, a0, 3 @@ -253,7 +250,6 @@ define <4 x half> @splat_idx_v4f16(<4 x half> %v, i64 %idx) { ; RV64-ZFHMIN-NEXT: lh a0, 0(a0) ; RV64-ZFHMIN-NEXT: vmv.v.x v8, a0 ; RV64-ZFHMIN-NEXT: csrr a0, vlenb -; RV64-ZFHMIN-NEXT: slli a0, a0, 1 ; RV64-ZFHMIN-NEXT: add sp, sp, a0 ; RV64-ZFHMIN-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64-ZFHMIN-NEXT: addi sp, sp, 48 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index eff56e408d6d5..5911e8248f299 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -641,10 +641,10 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 66 -; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: slli a3, a2, 6 +; RV64-NEXT: add a2, a3, a2 ; RV64-NEXT: sub sp, sp, a2 -; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc2, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 66 * vlenb +; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc1, 0x00, 0x92, 0xa2, 0x38, 
0x00, 0x1e, 0x22 # sp + 16 + 65 * vlenb ; RV64-NEXT: addi a2, a1, 256 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v16, (a2) @@ -1065,8 +1065,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vse64.v v8, (a0) ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: li a1, 66 -; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: slli a1, a0, 6 +; RV64-NEXT: add a0, a1, a0 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll index d52cbb54c4b2d..805b371f1e3d5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll @@ -104,9 +104,10 @@ define <3 x i64> @llrint_v3i64_v3f32(<3 x float> %x) { ; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset ra, -4 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 2 +; RV32-NEXT: slli a1, a0, 1 +; RV32-NEXT: add a0, a1, a0 ; RV32-NEXT: sub sp, sp, a0 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 4 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 3 * vlenb ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 1 ; RV32-NEXT: add a0, sp, a0 @@ -167,7 +168,8 @@ define <3 x i64> @llrint_v3i64_v3f32(<3 x float> %x) { ; RV32-NEXT: vslide1down.vx v8, v8, a0 ; RV32-NEXT: vslide1down.vx v8, v8, a1 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 2 +; RV32-NEXT: slli a1, a0, 1 +; RV32-NEXT: add a0, a1, a0 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 32 @@ -210,9 +212,10 @@ define <4 x i64> @llrint_v4i64_v4f32(<4 x float> %x) { ; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill ; 
RV32-NEXT: .cfi_offset ra, -4 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 2 +; RV32-NEXT: slli a1, a0, 1 +; RV32-NEXT: add a0, a1, a0 ; RV32-NEXT: sub sp, sp, a0 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 4 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 3 * vlenb ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 1 ; RV32-NEXT: add a0, sp, a0 @@ -273,7 +276,8 @@ define <4 x i64> @llrint_v4i64_v4f32(<4 x float> %x) { ; RV32-NEXT: vslide1down.vx v8, v8, a0 ; RV32-NEXT: vslide1down.vx v8, v8, a1 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 2 +; RV32-NEXT: slli a1, a0, 1 +; RV32-NEXT: add a0, a1, a0 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 32 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmaccbf16.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmaccbf16.ll index 62a479bdedf64..b953cf1f5bed8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmaccbf16.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmaccbf16.ll @@ -33,9 +33,8 @@ define <1 x float> @vfwmaccbf16_vv_v1f32(<1 x float> %a, <1 x bfloat> %b, <1 x b ; ZVFBMIN32-NEXT: .cfi_offset s0, -8 ; ZVFBMIN32-NEXT: .cfi_offset fs0, -16 ; ZVFBMIN32-NEXT: csrr a0, vlenb -; ZVFBMIN32-NEXT: slli a0, a0, 1 ; ZVFBMIN32-NEXT: sub sp, sp, a0 -; ZVFBMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 2 * vlenb +; ZVFBMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 1 * vlenb ; ZVFBMIN32-NEXT: fmv.s fs0, fa0 ; ZVFBMIN32-NEXT: addi a0, sp, 16 ; ZVFBMIN32-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill @@ -58,7 +57,6 @@ define <1 x float> @vfwmaccbf16_vv_v1f32(<1 x float> %a, <1 x bfloat> %b, <1 x b ; 
ZVFBMIN32-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload ; ZVFBMIN32-NEXT: vfmacc.vv v8, v10, v9 ; ZVFBMIN32-NEXT: csrr a0, vlenb -; ZVFBMIN32-NEXT: slli a0, a0, 1 ; ZVFBMIN32-NEXT: add sp, sp, a0 ; ZVFBMIN32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; ZVFBMIN32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload @@ -77,9 +75,8 @@ define <1 x float> @vfwmaccbf16_vv_v1f32(<1 x float> %a, <1 x bfloat> %b, <1 x b ; ZVFBMIN64-NEXT: .cfi_offset s0, -16 ; ZVFBMIN64-NEXT: .cfi_offset fs0, -24 ; ZVFBMIN64-NEXT: csrr a0, vlenb -; ZVFBMIN64-NEXT: slli a0, a0, 1 ; ZVFBMIN64-NEXT: sub sp, sp, a0 -; ZVFBMIN64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 2 * vlenb +; ZVFBMIN64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 1 * vlenb ; ZVFBMIN64-NEXT: fmv.s fs0, fa0 ; ZVFBMIN64-NEXT: addi a0, sp, 32 ; ZVFBMIN64-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill @@ -104,7 +101,6 @@ define <1 x float> @vfwmaccbf16_vv_v1f32(<1 x float> %a, <1 x bfloat> %b, <1 x b ; ZVFBMIN64-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload ; ZVFBMIN64-NEXT: vfmacc.vv v8, v10, v9 ; ZVFBMIN64-NEXT: csrr a0, vlenb -; ZVFBMIN64-NEXT: slli a0, a0, 1 ; ZVFBMIN64-NEXT: add sp, sp, a0 ; ZVFBMIN64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload ; ZVFBMIN64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload @@ -144,9 +140,8 @@ define <1 x float> @vfwmaccbf16_vf_v1f32(<1 x float> %a, bfloat %b, <1 x bfloat> ; ZVFBMIN32-NEXT: .cfi_offset ra, -4 ; ZVFBMIN32-NEXT: .cfi_offset fs0, -16 ; ZVFBMIN32-NEXT: csrr a0, vlenb -; ZVFBMIN32-NEXT: slli a0, a0, 1 ; ZVFBMIN32-NEXT: sub sp, sp, a0 -; ZVFBMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 2 * vlenb +; ZVFBMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 1 * vlenb ; 
ZVFBMIN32-NEXT: fmv.s fs0, fa0 ; ZVFBMIN32-NEXT: addi a0, sp, 32 ; ZVFBMIN32-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill @@ -167,7 +162,6 @@ define <1 x float> @vfwmaccbf16_vf_v1f32(<1 x float> %a, bfloat %b, <1 x bfloat> ; ZVFBMIN32-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload ; ZVFBMIN32-NEXT: vfmacc.vv v8, v10, v9 ; ZVFBMIN32-NEXT: csrr a0, vlenb -; ZVFBMIN32-NEXT: slli a0, a0, 1 ; ZVFBMIN32-NEXT: add sp, sp, a0 ; ZVFBMIN32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; ZVFBMIN32-NEXT: fld fs0, 32(sp) # 8-byte Folded Reload @@ -183,9 +177,8 @@ define <1 x float> @vfwmaccbf16_vf_v1f32(<1 x float> %a, bfloat %b, <1 x bfloat> ; ZVFBMIN64-NEXT: .cfi_offset ra, -8 ; ZVFBMIN64-NEXT: .cfi_offset fs0, -16 ; ZVFBMIN64-NEXT: csrr a0, vlenb -; ZVFBMIN64-NEXT: slli a0, a0, 1 ; ZVFBMIN64-NEXT: sub sp, sp, a0 -; ZVFBMIN64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 2 * vlenb +; ZVFBMIN64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 1 * vlenb ; ZVFBMIN64-NEXT: fmv.s fs0, fa0 ; ZVFBMIN64-NEXT: addi a0, sp, 32 ; ZVFBMIN64-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill @@ -208,7 +201,6 @@ define <1 x float> @vfwmaccbf16_vf_v1f32(<1 x float> %a, bfloat %b, <1 x bfloat> ; ZVFBMIN64-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload ; ZVFBMIN64-NEXT: vfmacc.vv v8, v10, v9 ; ZVFBMIN64-NEXT: csrr a0, vlenb -; ZVFBMIN64-NEXT: slli a0, a0, 1 ; ZVFBMIN64-NEXT: add sp, sp, a0 ; ZVFBMIN64-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; ZVFBMIN64-NEXT: fld fs0, 32(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll index 0c180cd148b81..c055039876191 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll @@ -1844,10 +1844,10 @@ define void 
@vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base, ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: li a4, 10 -; RV64-NEXT: mul a3, a3, a4 +; RV64-NEXT: slli a4, a3, 3 +; RV64-NEXT: add a3, a4, a3 ; RV64-NEXT: sub sp, sp, a3 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x0a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 10 * vlenb +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x09, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 9 * vlenb ; RV64-NEXT: li a3, 32 ; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV64-NEXT: vle32.v v24, (a1) @@ -1888,8 +1888,8 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base, ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: li a1, 10 -; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: slli a1, a0, 3 +; RV64-NEXT: add a0, a1, a0 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret @@ -1931,10 +1931,10 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base, ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: li a4, 10 -; RV64-NEXT: mul a3, a3, a4 +; RV64-NEXT: slli a4, a3, 3 +; RV64-NEXT: add a3, a4, a3 ; RV64-NEXT: sub sp, sp, a3 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x0a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 10 * vlenb +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x09, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 9 * vlenb ; RV64-NEXT: li a3, 32 ; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV64-NEXT: vle32.v v24, (a1) @@ -1975,8 +1975,8 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base, ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t ; 
RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: li a1, 10 -; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: slli a1, a0, 3 +; RV64-NEXT: add a0, a1, a0 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll index 320db35770cb8..dd01e1c1ee66d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll @@ -294,10 +294,10 @@ define @vfmax_vv_nxv32bf16( %va, @vfmax_vv_nxv32bf16( %va, @vfmax_vv_nxv32f16( %va, @vfmax_vv_nxv32f16( %va, @vfmax_vv_nxv16f64( %va, @vfmax_vv_nxv16f64( %va, @vfmin_vv_nxv32bf16( %va, @vfmin_vv_nxv32bf16( %va, @vfmin_vv_nxv32f16( %va, @vfmin_vv_nxv32f16( %va, @vfmin_vv_nxv16f64( %va, @vfmin_vv_nxv16f64( %va, @stest_f16i32(<4 x half> %x) { ; CHECK-V-NEXT: .cfi_offset s1, -24 ; CHECK-V-NEXT: .cfi_offset s2, -32 ; CHECK-V-NEXT: csrr a1, vlenb -; CHECK-V-NEXT: slli a1, a1, 2 +; CHECK-V-NEXT: slli a2, a1, 1 +; CHECK-V-NEXT: add a1, a2, a1 ; CHECK-V-NEXT: sub sp, sp, a1 -; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb +; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 3 * vlenb ; CHECK-V-NEXT: lhu s0, 0(a0) ; CHECK-V-NEXT: lhu s1, 8(a0) ; CHECK-V-NEXT: lhu s2, 16(a0) @@ -482,7 +483,8 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) { ; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-V-NEXT: vnclip.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 2 +; CHECK-V-NEXT: slli a1, a0, 1 +; CHECK-V-NEXT: add a0, a1, a0 ; CHECK-V-NEXT: add sp, sp, a0 ; CHECK-V-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; CHECK-V-NEXT: ld s0, 32(sp) # 8-byte Folded Reload @@ -593,9 +595,10 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) { ; CHECK-V-NEXT: .cfi_offset s1, -24 ; CHECK-V-NEXT: .cfi_offset s2, -32 ; CHECK-V-NEXT: 
csrr a1, vlenb -; CHECK-V-NEXT: slli a1, a1, 2 +; CHECK-V-NEXT: slli a2, a1, 1 +; CHECK-V-NEXT: add a1, a2, a1 ; CHECK-V-NEXT: sub sp, sp, a1 -; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb +; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 3 * vlenb ; CHECK-V-NEXT: lhu s0, 0(a0) ; CHECK-V-NEXT: lhu s1, 8(a0) ; CHECK-V-NEXT: lhu s2, 16(a0) @@ -643,7 +646,8 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) { ; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-V-NEXT: vnclipu.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 2 +; CHECK-V-NEXT: slli a1, a0, 1 +; CHECK-V-NEXT: add a0, a1, a0 ; CHECK-V-NEXT: add sp, sp, a0 ; CHECK-V-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; CHECK-V-NEXT: ld s0, 32(sp) # 8-byte Folded Reload @@ -764,9 +768,10 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-V-NEXT: .cfi_offset s1, -24 ; CHECK-V-NEXT: .cfi_offset s2, -32 ; CHECK-V-NEXT: csrr a1, vlenb -; CHECK-V-NEXT: slli a1, a1, 2 +; CHECK-V-NEXT: slli a2, a1, 1 +; CHECK-V-NEXT: add a1, a2, a1 ; CHECK-V-NEXT: sub sp, sp, a1 -; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb +; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 3 * vlenb ; CHECK-V-NEXT: lhu s0, 0(a0) ; CHECK-V-NEXT: lhu s1, 8(a0) ; CHECK-V-NEXT: lhu s2, 16(a0) @@ -815,7 +820,8 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-V-NEXT: vnclipu.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 2 +; CHECK-V-NEXT: slli a1, a0, 1 +; CHECK-V-NEXT: add a0, a1, a0 ; CHECK-V-NEXT: add sp, sp, a0 ; CHECK-V-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; CHECK-V-NEXT: ld s0, 
32(sp) # 8-byte Folded Reload @@ -2250,9 +2256,8 @@ define <2 x i64> @stest_f64i64(<2 x double> %x) { ; CHECK-V-NEXT: .cfi_offset s0, -16 ; CHECK-V-NEXT: .cfi_offset s1, -24 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: sub sp, sp, a0 -; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 2 * vlenb +; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 1 * vlenb ; CHECK-V-NEXT: addi a0, sp, 32 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -2319,7 +2324,6 @@ define <2 x i64> @stest_f64i64(<2 x double> %x) { ; CHECK-V-NEXT: vmv.s.x v9, s0 ; CHECK-V-NEXT: vslideup.vi v8, v9, 1 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: add sp, sp, a0 ; CHECK-V-NEXT: ld ra, 56(sp) # 8-byte Folded Reload ; CHECK-V-NEXT: ld s0, 48(sp) # 8-byte Folded Reload @@ -2380,9 +2384,8 @@ define <2 x i64> @utest_f64i64(<2 x double> %x) { ; CHECK-V-NEXT: .cfi_offset s0, -16 ; CHECK-V-NEXT: .cfi_offset s1, -24 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: sub sp, sp, a0 -; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 2 * vlenb +; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 1 * vlenb ; CHECK-V-NEXT: addi a0, sp, 32 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -2404,7 +2407,6 @@ define <2 x i64> @utest_f64i64(<2 x double> %x) { ; CHECK-V-NEXT: vmv.s.x v9, a2 ; CHECK-V-NEXT: vslideup.vi v8, v9, 1 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: add sp, sp, a0 ; CHECK-V-NEXT: ld ra, 56(sp) # 8-byte Folded 
Reload ; CHECK-V-NEXT: ld s0, 48(sp) # 8-byte Folded Reload @@ -2489,9 +2491,8 @@ define <2 x i64> @ustest_f64i64(<2 x double> %x) { ; CHECK-V-NEXT: .cfi_offset s0, -16 ; CHECK-V-NEXT: .cfi_offset s1, -24 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: sub sp, sp, a0 -; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 2 * vlenb +; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 1 * vlenb ; CHECK-V-NEXT: addi a0, sp, 32 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -2540,7 +2541,6 @@ define <2 x i64> @ustest_f64i64(<2 x double> %x) { ; CHECK-V-NEXT: vmv.s.x v9, a2 ; CHECK-V-NEXT: vslideup.vi v8, v9, 1 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: add sp, sp, a0 ; CHECK-V-NEXT: ld ra, 56(sp) # 8-byte Folded Reload ; CHECK-V-NEXT: ld s0, 48(sp) # 8-byte Folded Reload @@ -2648,9 +2648,8 @@ define <2 x i64> @stest_f32i64(<2 x float> %x) { ; CHECK-V-NEXT: .cfi_offset s0, -16 ; CHECK-V-NEXT: .cfi_offset s1, -24 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: sub sp, sp, a0 -; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 2 * vlenb +; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 1 * vlenb ; CHECK-V-NEXT: addi a0, sp, 32 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -2717,7 +2716,6 @@ define <2 x i64> @stest_f32i64(<2 x float> %x) { ; CHECK-V-NEXT: vmv.s.x v9, s0 ; CHECK-V-NEXT: vslideup.vi v8, v9, 1 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: add sp, sp, a0 ; CHECK-V-NEXT: ld 
ra, 56(sp) # 8-byte Folded Reload ; CHECK-V-NEXT: ld s0, 48(sp) # 8-byte Folded Reload @@ -2778,9 +2776,8 @@ define <2 x i64> @utest_f32i64(<2 x float> %x) { ; CHECK-V-NEXT: .cfi_offset s0, -16 ; CHECK-V-NEXT: .cfi_offset s1, -24 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: sub sp, sp, a0 -; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 2 * vlenb +; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 1 * vlenb ; CHECK-V-NEXT: addi a0, sp, 32 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -2802,7 +2799,6 @@ define <2 x i64> @utest_f32i64(<2 x float> %x) { ; CHECK-V-NEXT: vmv.s.x v9, a2 ; CHECK-V-NEXT: vslideup.vi v8, v9, 1 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: add sp, sp, a0 ; CHECK-V-NEXT: ld ra, 56(sp) # 8-byte Folded Reload ; CHECK-V-NEXT: ld s0, 48(sp) # 8-byte Folded Reload @@ -2887,9 +2883,8 @@ define <2 x i64> @ustest_f32i64(<2 x float> %x) { ; CHECK-V-NEXT: .cfi_offset s0, -16 ; CHECK-V-NEXT: .cfi_offset s1, -24 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: sub sp, sp, a0 -; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 2 * vlenb +; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 1 * vlenb ; CHECK-V-NEXT: addi a0, sp, 32 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -2938,7 +2933,6 @@ define <2 x i64> @ustest_f32i64(<2 x float> %x) { ; CHECK-V-NEXT: vmv.s.x v9, a2 ; CHECK-V-NEXT: vslideup.vi v8, v9, 1 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: add sp, 
sp, a0 ; CHECK-V-NEXT: ld ra, 56(sp) # 8-byte Folded Reload ; CHECK-V-NEXT: ld s0, 48(sp) # 8-byte Folded Reload @@ -3767,9 +3761,10 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) { ; CHECK-V-NEXT: .cfi_offset s1, -24 ; CHECK-V-NEXT: .cfi_offset s2, -32 ; CHECK-V-NEXT: csrr a1, vlenb -; CHECK-V-NEXT: slli a1, a1, 2 +; CHECK-V-NEXT: slli a2, a1, 1 +; CHECK-V-NEXT: add a1, a2, a1 ; CHECK-V-NEXT: sub sp, sp, a1 -; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb +; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 3 * vlenb ; CHECK-V-NEXT: lhu s0, 0(a0) ; CHECK-V-NEXT: lhu s1, 8(a0) ; CHECK-V-NEXT: lhu s2, 16(a0) @@ -3817,7 +3812,8 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) { ; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-V-NEXT: vnclip.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 2 +; CHECK-V-NEXT: slli a1, a0, 1 +; CHECK-V-NEXT: add a0, a1, a0 ; CHECK-V-NEXT: add sp, sp, a0 ; CHECK-V-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; CHECK-V-NEXT: ld s0, 32(sp) # 8-byte Folded Reload @@ -3926,9 +3922,10 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) { ; CHECK-V-NEXT: .cfi_offset s1, -24 ; CHECK-V-NEXT: .cfi_offset s2, -32 ; CHECK-V-NEXT: csrr a1, vlenb -; CHECK-V-NEXT: slli a1, a1, 2 +; CHECK-V-NEXT: slli a2, a1, 1 +; CHECK-V-NEXT: add a1, a2, a1 ; CHECK-V-NEXT: sub sp, sp, a1 -; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb +; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 3 * vlenb ; CHECK-V-NEXT: lhu s0, 0(a0) ; CHECK-V-NEXT: lhu s1, 8(a0) ; CHECK-V-NEXT: lhu s2, 16(a0) @@ -3976,7 +3973,8 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) { ; CHECK-V-NEXT: vsetvli zero, zero, 
e32, m1, ta, ma ; CHECK-V-NEXT: vnclipu.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 2 +; CHECK-V-NEXT: slli a1, a0, 1 +; CHECK-V-NEXT: add a0, a1, a0 ; CHECK-V-NEXT: add sp, sp, a0 ; CHECK-V-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; CHECK-V-NEXT: ld s0, 32(sp) # 8-byte Folded Reload @@ -4096,9 +4094,10 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) { ; CHECK-V-NEXT: .cfi_offset s1, -24 ; CHECK-V-NEXT: .cfi_offset s2, -32 ; CHECK-V-NEXT: csrr a1, vlenb -; CHECK-V-NEXT: slli a1, a1, 2 +; CHECK-V-NEXT: slli a2, a1, 1 +; CHECK-V-NEXT: add a1, a2, a1 ; CHECK-V-NEXT: sub sp, sp, a1 -; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb +; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 3 * vlenb ; CHECK-V-NEXT: lhu s0, 0(a0) ; CHECK-V-NEXT: lhu s1, 8(a0) ; CHECK-V-NEXT: lhu s2, 16(a0) @@ -4147,7 +4146,8 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) { ; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-V-NEXT: vnclipu.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 2 +; CHECK-V-NEXT: slli a1, a0, 1 +; CHECK-V-NEXT: add a0, a1, a0 ; CHECK-V-NEXT: add sp, sp, a0 ; CHECK-V-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; CHECK-V-NEXT: ld s0, 32(sp) # 8-byte Folded Reload @@ -5568,9 +5568,8 @@ define <2 x i64> @stest_f64i64_mm(<2 x double> %x) { ; CHECK-V-NEXT: .cfi_offset s0, -16 ; CHECK-V-NEXT: .cfi_offset s1, -24 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: sub sp, sp, a0 -; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 2 * vlenb +; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 1 * vlenb ; CHECK-V-NEXT: addi a0, sp, 32 ; 
CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -5640,7 +5639,6 @@ define <2 x i64> @stest_f64i64_mm(<2 x double> %x) { ; CHECK-V-NEXT: vmv.s.x v9, s0 ; CHECK-V-NEXT: vslideup.vi v8, v9, 1 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: add sp, sp, a0 ; CHECK-V-NEXT: ld ra, 56(sp) # 8-byte Folded Reload ; CHECK-V-NEXT: ld s0, 48(sp) # 8-byte Folded Reload @@ -5699,9 +5697,8 @@ define <2 x i64> @utest_f64i64_mm(<2 x double> %x) { ; CHECK-V-NEXT: .cfi_offset s0, -16 ; CHECK-V-NEXT: .cfi_offset s1, -24 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: sub sp, sp, a0 -; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 2 * vlenb +; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 1 * vlenb ; CHECK-V-NEXT: addi a0, sp, 32 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -5726,7 +5723,6 @@ define <2 x i64> @utest_f64i64_mm(<2 x double> %x) { ; CHECK-V-NEXT: vmv.s.x v9, a0 ; CHECK-V-NEXT: vslideup.vi v8, v9, 1 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: add sp, sp, a0 ; CHECK-V-NEXT: ld ra, 56(sp) # 8-byte Folded Reload ; CHECK-V-NEXT: ld s0, 48(sp) # 8-byte Folded Reload @@ -5799,9 +5795,8 @@ define <2 x i64> @ustest_f64i64_mm(<2 x double> %x) { ; CHECK-V-NEXT: .cfi_offset s0, -16 ; CHECK-V-NEXT: .cfi_offset s1, -24 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: sub sp, sp, a0 -; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 2 * vlenb +; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 1 * 
vlenb ; CHECK-V-NEXT: addi a0, sp, 32 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -5839,7 +5834,6 @@ define <2 x i64> @ustest_f64i64_mm(<2 x double> %x) { ; CHECK-V-NEXT: vmv.s.x v9, a1 ; CHECK-V-NEXT: vslideup.vi v8, v9, 1 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: add sp, sp, a0 ; CHECK-V-NEXT: ld ra, 56(sp) # 8-byte Folded Reload ; CHECK-V-NEXT: ld s0, 48(sp) # 8-byte Folded Reload @@ -5948,9 +5942,8 @@ define <2 x i64> @stest_f32i64_mm(<2 x float> %x) { ; CHECK-V-NEXT: .cfi_offset s0, -16 ; CHECK-V-NEXT: .cfi_offset s1, -24 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: sub sp, sp, a0 -; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 2 * vlenb +; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 1 * vlenb ; CHECK-V-NEXT: addi a0, sp, 32 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -6020,7 +6013,6 @@ define <2 x i64> @stest_f32i64_mm(<2 x float> %x) { ; CHECK-V-NEXT: vmv.s.x v9, s0 ; CHECK-V-NEXT: vslideup.vi v8, v9, 1 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: add sp, sp, a0 ; CHECK-V-NEXT: ld ra, 56(sp) # 8-byte Folded Reload ; CHECK-V-NEXT: ld s0, 48(sp) # 8-byte Folded Reload @@ -6079,9 +6071,8 @@ define <2 x i64> @utest_f32i64_mm(<2 x float> %x) { ; CHECK-V-NEXT: .cfi_offset s0, -16 ; CHECK-V-NEXT: .cfi_offset s1, -24 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: sub sp, sp, a0 -; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 2 * vlenb +; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x01, 0x92, 0xa2, 
0x38, 0x00, 0x1e, 0x22 # sp + 64 + 1 * vlenb ; CHECK-V-NEXT: addi a0, sp, 32 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma @@ -6106,7 +6097,6 @@ define <2 x i64> @utest_f32i64_mm(<2 x float> %x) { ; CHECK-V-NEXT: vmv.s.x v9, a0 ; CHECK-V-NEXT: vslideup.vi v8, v9, 1 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: add sp, sp, a0 ; CHECK-V-NEXT: ld ra, 56(sp) # 8-byte Folded Reload ; CHECK-V-NEXT: ld s0, 48(sp) # 8-byte Folded Reload @@ -6179,9 +6169,8 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) { ; CHECK-V-NEXT: .cfi_offset s0, -16 ; CHECK-V-NEXT: .cfi_offset s1, -24 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: sub sp, sp, a0 -; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 2 * vlenb +; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 1 * vlenb ; CHECK-V-NEXT: addi a0, sp, 32 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma @@ -6219,7 +6208,6 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) { ; CHECK-V-NEXT: vmv.s.x v9, a1 ; CHECK-V-NEXT: vslideup.vi v8, v9, 1 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: add sp, sp, a0 ; CHECK-V-NEXT: ld ra, 56(sp) # 8-byte Folded Reload ; CHECK-V-NEXT: ld s0, 48(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rvv/frm-insert.ll b/llvm/test/CodeGen/RISCV/rvv/frm-insert.ll index 0e102d98c79cf..ccfe94ecad286 100644 --- a/llvm/test/CodeGen/RISCV/rvv/frm-insert.ll +++ b/llvm/test/CodeGen/RISCV/rvv/frm-insert.ll @@ -87,7 +87,6 @@ define @just_call( %0) nounwind { ; CHECK-NEXT: addi sp, sp, -48 ; CHECK-NEXT: sd ra, 40(sp) # 8-byte Folded Spill ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: sub 
sp, sp, a0 ; CHECK-NEXT: addi a0, sp, 32 ; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill @@ -95,7 +94,6 @@ define @just_call( %0) nounwind { ; CHECK-NEXT: addi a0, sp, 32 ; CHECK-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; CHECK-NEXT: addi sp, sp, 48 @@ -106,7 +104,6 @@ define @just_call( %0) nounwind { ; UNOPT-NEXT: addi sp, sp, -48 ; UNOPT-NEXT: sd ra, 40(sp) # 8-byte Folded Spill ; UNOPT-NEXT: csrr a0, vlenb -; UNOPT-NEXT: slli a0, a0, 1 ; UNOPT-NEXT: sub sp, sp, a0 ; UNOPT-NEXT: addi a0, sp, 32 ; UNOPT-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill @@ -114,7 +111,6 @@ define @just_call( %0) nounwind { ; UNOPT-NEXT: addi a0, sp, 32 ; UNOPT-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload ; UNOPT-NEXT: csrr a0, vlenb -; UNOPT-NEXT: slli a0, a0, 1 ; UNOPT-NEXT: add sp, sp, a0 ; UNOPT-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; UNOPT-NEXT: addi sp, sp, 48 @@ -130,7 +126,6 @@ define @before_call1( %0, @before_call1( %0, @before_call1( %0, @before_call1( %0, @before_call2( %0, @before_call2( %0, @before_call2( %0, @before_call2( %0, @after_call1( %0, @after_call1( %0, @after_call1( %0, @after_call1( %0, @after_call2( %0, @after_call2( %0, @after_call2( %0, @after_call2( %0, @spill_lmul_mf2( %va) nounwind { ; SPILL-O0: # %bb.0: # %entry ; SPILL-O0-NEXT: addi sp, sp, -16 ; SPILL-O0-NEXT: csrr a0, vlenb -; SPILL-O0-NEXT: slli a0, a0, 1 ; SPILL-O0-NEXT: sub sp, sp, a0 ; SPILL-O0-NEXT: addi a0, sp, 16 ; SPILL-O0-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill @@ -18,7 +17,6 @@ define @spill_lmul_mf2( %va) nounwind { ; SPILL-O0-NEXT: addi a0, sp, 16 ; SPILL-O0-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload ; SPILL-O0-NEXT: csrr a0, vlenb -; SPILL-O0-NEXT: slli a0, a0, 1 ; SPILL-O0-NEXT: add sp, sp, a0 ; SPILL-O0-NEXT: addi sp, sp, 16 ; SPILL-O0-NEXT: ret @@ -27,7 +25,6 @@ define @spill_lmul_mf2( %va) 
nounwind { ; SPILL-O2: # %bb.0: # %entry ; SPILL-O2-NEXT: addi sp, sp, -16 ; SPILL-O2-NEXT: csrr a0, vlenb -; SPILL-O2-NEXT: slli a0, a0, 1 ; SPILL-O2-NEXT: sub sp, sp, a0 ; SPILL-O2-NEXT: addi a0, sp, 16 ; SPILL-O2-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill @@ -35,7 +32,6 @@ define @spill_lmul_mf2( %va) nounwind { ; SPILL-O2-NEXT: #NO_APP ; SPILL-O2-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload ; SPILL-O2-NEXT: csrr a0, vlenb -; SPILL-O2-NEXT: slli a0, a0, 1 ; SPILL-O2-NEXT: add sp, sp, a0 ; SPILL-O2-NEXT: addi sp, sp, 16 ; SPILL-O2-NEXT: ret @@ -51,7 +47,6 @@ define @spill_lmul_1( %va) nounwind { ; SPILL-O0: # %bb.0: # %entry ; SPILL-O0-NEXT: addi sp, sp, -16 ; SPILL-O0-NEXT: csrr a0, vlenb -; SPILL-O0-NEXT: slli a0, a0, 1 ; SPILL-O0-NEXT: sub sp, sp, a0 ; SPILL-O0-NEXT: addi a0, sp, 16 ; SPILL-O0-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill @@ -60,7 +55,6 @@ define @spill_lmul_1( %va) nounwind { ; SPILL-O0-NEXT: addi a0, sp, 16 ; SPILL-O0-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload ; SPILL-O0-NEXT: csrr a0, vlenb -; SPILL-O0-NEXT: slli a0, a0, 1 ; SPILL-O0-NEXT: add sp, sp, a0 ; SPILL-O0-NEXT: addi sp, sp, 16 ; SPILL-O0-NEXT: ret @@ -69,7 +63,6 @@ define @spill_lmul_1( %va) nounwind { ; SPILL-O2: # %bb.0: # %entry ; SPILL-O2-NEXT: addi sp, sp, -16 ; SPILL-O2-NEXT: csrr a0, vlenb -; SPILL-O2-NEXT: slli a0, a0, 1 ; SPILL-O2-NEXT: sub sp, sp, a0 ; SPILL-O2-NEXT: addi a0, sp, 16 ; SPILL-O2-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill @@ -77,7 +70,6 @@ define @spill_lmul_1( %va) nounwind { ; SPILL-O2-NEXT: #NO_APP ; SPILL-O2-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload ; SPILL-O2-NEXT: csrr a0, vlenb -; SPILL-O2-NEXT: slli a0, a0, 1 ; SPILL-O2-NEXT: add sp, sp, a0 ; SPILL-O2-NEXT: addi sp, sp, 16 ; SPILL-O2-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/rv32-spill-zvlsseg.ll b/llvm/test/CodeGen/RISCV/rvv/rv32-spill-zvlsseg.ll index c12df627b49d6..b34952b64f09e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rv32-spill-zvlsseg.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/rv32-spill-zvlsseg.ll @@ -11,7 +11,6 @@ define @spill_zvlsseg_nxv1i32(ptr %base, i32 %vl) nounwind { ; SPILL-O0: # %bb.0: # %entry ; SPILL-O0-NEXT: addi sp, sp, -16 ; SPILL-O0-NEXT: csrr a2, vlenb -; SPILL-O0-NEXT: slli a2, a2, 1 ; SPILL-O0-NEXT: sub sp, sp, a2 ; SPILL-O0-NEXT: # implicit-def: $v8_v9 ; SPILL-O0-NEXT: vsetvli zero, a1, e32, mf2, tu, ma @@ -24,7 +23,6 @@ define @spill_zvlsseg_nxv1i32(ptr %base, i32 %vl) nounwind { ; SPILL-O0-NEXT: addi a0, sp, 16 ; SPILL-O0-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload ; SPILL-O0-NEXT: csrr a0, vlenb -; SPILL-O0-NEXT: slli a0, a0, 1 ; SPILL-O0-NEXT: add sp, sp, a0 ; SPILL-O0-NEXT: addi sp, sp, 16 ; SPILL-O0-NEXT: ret @@ -88,7 +86,6 @@ define @spill_zvlsseg_nxv2i32(ptr %base, i32 %vl) nounwind { ; SPILL-O0: # %bb.0: # %entry ; SPILL-O0-NEXT: addi sp, sp, -16 ; SPILL-O0-NEXT: csrr a2, vlenb -; SPILL-O0-NEXT: slli a2, a2, 1 ; SPILL-O0-NEXT: sub sp, sp, a2 ; SPILL-O0-NEXT: # implicit-def: $v8_v9 ; SPILL-O0-NEXT: vsetvli zero, a1, e32, m1, tu, ma @@ -101,7 +98,6 @@ define @spill_zvlsseg_nxv2i32(ptr %base, i32 %vl) nounwind { ; SPILL-O0-NEXT: addi a0, sp, 16 ; SPILL-O0-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload ; SPILL-O0-NEXT: csrr a0, vlenb -; SPILL-O0-NEXT: slli a0, a0, 1 ; SPILL-O0-NEXT: add sp, sp, a0 ; SPILL-O0-NEXT: addi sp, sp, 16 ; SPILL-O0-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/rv64-spill-vector.ll b/llvm/test/CodeGen/RISCV/rvv/rv64-spill-vector.ll index 30075c2dad516..1e6ff0baddaef 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rv64-spill-vector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/rv64-spill-vector.ll @@ -9,7 +9,6 @@ define @spill_lmul_1( %va) nounwind { ; SPILL-O0: # %bb.0: # %entry ; SPILL-O0-NEXT: addi sp, sp, -16 ; SPILL-O0-NEXT: csrr a0, vlenb -; SPILL-O0-NEXT: slli a0, a0, 1 ; SPILL-O0-NEXT: sub sp, sp, a0 ; SPILL-O0-NEXT: addi a0, sp, 16 ; SPILL-O0-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill @@ -18,7 +17,6 @@ define @spill_lmul_1( %va) nounwind { ; 
SPILL-O0-NEXT: addi a0, sp, 16 ; SPILL-O0-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload ; SPILL-O0-NEXT: csrr a0, vlenb -; SPILL-O0-NEXT: slli a0, a0, 1 ; SPILL-O0-NEXT: add sp, sp, a0 ; SPILL-O0-NEXT: addi sp, sp, 16 ; SPILL-O0-NEXT: ret @@ -27,7 +25,6 @@ define @spill_lmul_1( %va) nounwind { ; SPILL-O2: # %bb.0: # %entry ; SPILL-O2-NEXT: addi sp, sp, -16 ; SPILL-O2-NEXT: csrr a0, vlenb -; SPILL-O2-NEXT: slli a0, a0, 1 ; SPILL-O2-NEXT: sub sp, sp, a0 ; SPILL-O2-NEXT: addi a0, sp, 16 ; SPILL-O2-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill @@ -35,7 +32,6 @@ define @spill_lmul_1( %va) nounwind { ; SPILL-O2-NEXT: #NO_APP ; SPILL-O2-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload ; SPILL-O2-NEXT: csrr a0, vlenb -; SPILL-O2-NEXT: slli a0, a0, 1 ; SPILL-O2-NEXT: add sp, sp, a0 ; SPILL-O2-NEXT: addi sp, sp, 16 ; SPILL-O2-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/rv64-spill-zvlsseg.ll b/llvm/test/CodeGen/RISCV/rvv/rv64-spill-zvlsseg.ll index afb4b1560728c..361adb55ef12f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rv64-spill-zvlsseg.ll +++ b/llvm/test/CodeGen/RISCV/rvv/rv64-spill-zvlsseg.ll @@ -11,7 +11,6 @@ define @spill_zvlsseg_nxv1i32(ptr %base, i64 %vl) nounwind { ; SPILL-O0: # %bb.0: # %entry ; SPILL-O0-NEXT: addi sp, sp, -16 ; SPILL-O0-NEXT: csrr a2, vlenb -; SPILL-O0-NEXT: slli a2, a2, 1 ; SPILL-O0-NEXT: sub sp, sp, a2 ; SPILL-O0-NEXT: # implicit-def: $v8_v9 ; SPILL-O0-NEXT: vsetvli zero, a1, e32, mf2, tu, ma @@ -24,7 +23,6 @@ define @spill_zvlsseg_nxv1i32(ptr %base, i64 %vl) nounwind { ; SPILL-O0-NEXT: addi a0, sp, 16 ; SPILL-O0-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload ; SPILL-O0-NEXT: csrr a0, vlenb -; SPILL-O0-NEXT: slli a0, a0, 1 ; SPILL-O0-NEXT: add sp, sp, a0 ; SPILL-O0-NEXT: addi sp, sp, 16 ; SPILL-O0-NEXT: ret @@ -88,7 +86,6 @@ define @spill_zvlsseg_nxv2i32(ptr %base, i64 %vl) nounwind { ; SPILL-O0: # %bb.0: # %entry ; SPILL-O0-NEXT: addi sp, sp, -16 ; SPILL-O0-NEXT: csrr a2, vlenb -; SPILL-O0-NEXT: slli a2, a2, 1 ; SPILL-O0-NEXT: 
sub sp, sp, a2 ; SPILL-O0-NEXT: # implicit-def: $v8_v9 ; SPILL-O0-NEXT: vsetvli zero, a1, e32, m1, tu, ma @@ -101,7 +98,6 @@ define @spill_zvlsseg_nxv2i32(ptr %base, i64 %vl) nounwind { ; SPILL-O0-NEXT: addi a0, sp, 16 ; SPILL-O0-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload ; SPILL-O0-NEXT: csrr a0, vlenb -; SPILL-O0-NEXT: slli a0, a0, 1 ; SPILL-O0-NEXT: add sp, sp, a0 ; SPILL-O0-NEXT: addi sp, sp, 16 ; SPILL-O0-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/scalar-stack-align.ll b/llvm/test/CodeGen/RISCV/rvv/scalar-stack-align.ll index 409ef50aa53c8..fcb5f07664aa5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/scalar-stack-align.ll +++ b/llvm/test/CodeGen/RISCV/rvv/scalar-stack-align.ll @@ -1,49 +1,77 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+zve64x -verify-machineinstrs < %s \ -; RUN: | FileCheck %s --check-prefix=RV32 +; RUN: | FileCheck %s --check-prefixes=RV32,RV32-ZVE64 ; RUN: llc -mtriple=riscv64 -mattr=+zve64x -verify-machineinstrs < %s \ -; RUN: | FileCheck %s --check-prefix=RV64 +; RUN: | FileCheck %s --check-prefixes=RV64,RV64-ZVE64 ; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \ -; RUN: | FileCheck %s --check-prefix=RV32 +; RUN: | FileCheck %s --check-prefixes=RV32,RV32-V ; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \ -; RUN: | FileCheck %s --check-prefix=RV64 - -; FIXME: We are over-aligning the stack on V, wasting stack space. 
+; RUN: | FileCheck %s --check-prefixes=RV64,RV64-V define ptr @scalar_stack_align16() nounwind { -; RV32-LABEL: scalar_stack_align16: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 1 -; RV32-NEXT: sub sp, sp, a0 -; RV32-NEXT: addi a0, sp, 32 -; RV32-NEXT: call extern -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 1 -; RV32-NEXT: add sp, sp, a1 -; RV32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 48 -; RV32-NEXT: ret +; RV32-ZVE64-LABEL: scalar_stack_align16: +; RV32-ZVE64: # %bb.0: +; RV32-ZVE64-NEXT: addi sp, sp, -48 +; RV32-ZVE64-NEXT: sw ra, 44(sp) # 4-byte Folded Spill +; RV32-ZVE64-NEXT: csrr a0, vlenb +; RV32-ZVE64-NEXT: slli a0, a0, 1 +; RV32-ZVE64-NEXT: sub sp, sp, a0 +; RV32-ZVE64-NEXT: addi a0, sp, 32 +; RV32-ZVE64-NEXT: call extern +; RV32-ZVE64-NEXT: addi a0, sp, 16 +; RV32-ZVE64-NEXT: csrr a1, vlenb +; RV32-ZVE64-NEXT: slli a1, a1, 1 +; RV32-ZVE64-NEXT: add sp, sp, a1 +; RV32-ZVE64-NEXT: lw ra, 44(sp) # 4-byte Folded Reload +; RV32-ZVE64-NEXT: addi sp, sp, 48 +; RV32-ZVE64-NEXT: ret +; +; RV64-ZVE64-LABEL: scalar_stack_align16: +; RV64-ZVE64: # %bb.0: +; RV64-ZVE64-NEXT: addi sp, sp, -48 +; RV64-ZVE64-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; RV64-ZVE64-NEXT: csrr a0, vlenb +; RV64-ZVE64-NEXT: slli a0, a0, 1 +; RV64-ZVE64-NEXT: sub sp, sp, a0 +; RV64-ZVE64-NEXT: addi a0, sp, 32 +; RV64-ZVE64-NEXT: call extern +; RV64-ZVE64-NEXT: addi a0, sp, 16 +; RV64-ZVE64-NEXT: csrr a1, vlenb +; RV64-ZVE64-NEXT: slli a1, a1, 1 +; RV64-ZVE64-NEXT: add sp, sp, a1 +; RV64-ZVE64-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; RV64-ZVE64-NEXT: addi sp, sp, 48 +; RV64-ZVE64-NEXT: ret +; +; RV32-V-LABEL: scalar_stack_align16: +; RV32-V: # %bb.0: +; RV32-V-NEXT: addi sp, sp, -48 +; RV32-V-NEXT: sw ra, 44(sp) # 4-byte Folded Spill +; RV32-V-NEXT: csrr a0, vlenb +; RV32-V-NEXT: sub sp, sp, a0 +; 
RV32-V-NEXT: addi a0, sp, 32 +; RV32-V-NEXT: call extern +; RV32-V-NEXT: addi a0, sp, 16 +; RV32-V-NEXT: csrr a1, vlenb +; RV32-V-NEXT: add sp, sp, a1 +; RV32-V-NEXT: lw ra, 44(sp) # 4-byte Folded Reload +; RV32-V-NEXT: addi sp, sp, 48 +; RV32-V-NEXT: ret ; -; RV64-LABEL: scalar_stack_align16: -; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -48 -; RV64-NEXT: sd ra, 40(sp) # 8-byte Folded Spill -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 1 -; RV64-NEXT: sub sp, sp, a0 -; RV64-NEXT: addi a0, sp, 32 -; RV64-NEXT: call extern -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 1 -; RV64-NEXT: add sp, sp, a1 -; RV64-NEXT: ld ra, 40(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 48 -; RV64-NEXT: ret +; RV64-V-LABEL: scalar_stack_align16: +; RV64-V: # %bb.0: +; RV64-V-NEXT: addi sp, sp, -48 +; RV64-V-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; RV64-V-NEXT: csrr a0, vlenb +; RV64-V-NEXT: sub sp, sp, a0 +; RV64-V-NEXT: addi a0, sp, 32 +; RV64-V-NEXT: call extern +; RV64-V-NEXT: addi a0, sp, 16 +; RV64-V-NEXT: csrr a1, vlenb +; RV64-V-NEXT: add sp, sp, a1 +; RV64-V-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; RV64-V-NEXT: addi sp, sp, 48 +; RV64-V-NEXT: ret %a = alloca %c = alloca i64, align 16 call void @extern(ptr %a) @@ -51,3 +79,6 @@ define ptr @scalar_stack_align16() nounwind { } declare void @extern(ptr) +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; RV32: {{.*}} +; RV64: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/stack-folding.ll b/llvm/test/CodeGen/RISCV/rvv/stack-folding.ll index f966835622a9f..ffe6ff8a91abd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/stack-folding.ll +++ b/llvm/test/CodeGen/RISCV/rvv/stack-folding.ll @@ -8,9 +8,8 @@ define i64 @i64( %v, i1 %c) { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 1 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb ; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: andi a0, a0, 1 @@ -29,7 +28,6 @@ define i64 @i64( %v, i1 %c) { ; RV32-NEXT: li a1, 0 ; RV32-NEXT: .LBB0_3: # %falsebb ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 1 ; RV32-NEXT: add sp, sp, a2 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -39,9 +37,8 @@ define i64 @i64( %v, i1 %c) { ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 1 ; RV64-NEXT: sub sp, sp, a1 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb ; RV64-NEXT: addi a1, sp, 16 ; RV64-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill ; RV64-NEXT: andi a0, a0, 1 @@ -52,7 +49,6 @@ define i64 @i64( %v, i1 %c) { ; RV64-NEXT: ld a0, 16(sp) # 8-byte Folded Reload ; RV64-NEXT: .LBB0_2: # %falsebb ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 1 ; RV64-NEXT: add sp, sp, a1 ; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret @@ -71,9 +67,8 
@@ define i32 @i32( %v, i1 %c) { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: andi a0, a0, 1 @@ -84,7 +79,6 @@ define i32 @i32( %v, i1 %c) { ; CHECK-NEXT: lw a0, 16(sp) # 8-byte Folded Reload ; CHECK-NEXT: .LBB1_2: # %falsebb ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: add sp, sp, a1 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -103,9 +97,8 @@ define i16 @i16( %v, i1 %c) { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: andi a0, a0, 1 @@ -116,7 +109,6 @@ define i16 @i16( %v, i1 %c) { ; CHECK-NEXT: lh a0, 16(sp) # 8-byte Folded Reload ; CHECK-NEXT: .LBB2_2: # %falsebb ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: add sp, sp, a1 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -135,9 +127,8 @@ define i8 @i8( %v, i1 %c) { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 
0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: andi a0, a0, 1 @@ -148,7 +139,6 @@ define i8 @i8( %v, i1 %c) { ; CHECK-NEXT: lb a0, 16(sp) # 8-byte Folded Reload ; CHECK-NEXT: .LBB3_2: # %falsebb ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: add sp, sp, a1 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -167,9 +157,8 @@ define double @f64( %v, i1 %c) { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 1 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb ; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: andi a0, a0, 1 @@ -183,7 +172,6 @@ define double @f64( %v, i1 %c) { ; RV32-NEXT: fcvt.d.w fa0, zero ; RV32-NEXT: .LBB4_3: # %falsebb ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -193,9 +181,8 @@ define double @f64( %v, i1 %c) { ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 1 ; RV64-NEXT: sub sp, sp, a1 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb ; RV64-NEXT: addi a1, sp, 16 ; RV64-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill ; 
RV64-NEXT: andi a0, a0, 1 @@ -209,7 +196,6 @@ define double @f64( %v, i1 %c) { ; RV64-NEXT: fmv.d.x fa0, zero ; RV64-NEXT: .LBB4_3: # %falsebb ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 1 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret @@ -228,9 +214,8 @@ define float @f32( %v, i1 %c) { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: andi a0, a0, 1 @@ -244,7 +229,6 @@ define float @f32( %v, i1 %c) { ; CHECK-NEXT: fmv.w.x fa0, zero ; CHECK-NEXT: .LBB5_3: # %falsebb ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll index b3de904d20622..4c298ab2b5e6d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll @@ -508,12 +508,10 @@ define @vfadd_vf_nxv32bf16( %va, bf ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: slli a2, a1, 4 +; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x12, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 18 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb ; 
CHECK-NEXT: vmv8r.v v24, v8 ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma @@ -577,10 +575,8 @@ define @vfadd_vf_nxv32bf16( %va, bf ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: slli a1, a0, 4 +; CHECK-NEXT: add a0, a1, a0 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -1301,12 +1297,10 @@ define @vfadd_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a1, a1, a2 +; ZVFHMIN-NEXT: slli a2, a1, 4 +; ZVFHMIN-NEXT: add a1, a2, a1 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x12, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 18 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb ; ZVFHMIN-NEXT: vmv8r.v v24, v8 ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma @@ -1370,10 +1364,8 @@ define @vfadd_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: slli a1, a0, 4 +; ZVFHMIN-NEXT: add a0, a1, a0 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: addi sp, sp, 16 ; ZVFHMIN-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll index aa39fe5b5ec85..0fe6c5dec4264 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll @@ 
-470,12 +470,10 @@ define @vfdiv_vf_nxv32bf16( %va, bf ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: slli a2, a1, 4 +; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x12, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 18 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb ; CHECK-NEXT: vmv8r.v v24, v8 ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma @@ -539,10 +537,8 @@ define @vfdiv_vf_nxv32bf16( %va, bf ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: slli a1, a0, 4 +; CHECK-NEXT: add a0, a1, a0 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -1213,12 +1209,10 @@ define @vfdiv_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a1, a1, a2 +; ZVFHMIN-NEXT: slli a2, a1, 4 +; ZVFHMIN-NEXT: add a1, a2, a1 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x12, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 18 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb ; ZVFHMIN-NEXT: vmv8r.v v24, v8 ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma @@ -1282,10 +1276,8 @@ define @vfdiv_vf_nxv32f16( 
%va, half %b ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: slli a1, a0, 4 +; ZVFHMIN-NEXT: add a0, a1, a0 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: addi sp, sp, 16 ; ZVFHMIN-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll index baecb7bb7d248..f0c74d064016a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll @@ -642,14 +642,13 @@ define @vfma_vv_nxv32bf16( %va, @vfma_vv_nxv32bf16( %va, @vfma_vf_nxv32bf16( %va, bfl ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: slli a1, a1, 2 +; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a2, a2, a1 ; CHECK-NEXT: slli a1, a1, 2 ; CHECK-NEXT: add a1, a1, a2 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x2a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 42 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv.v.x v24, a1 @@ -1038,9 +1035,8 @@ define @vfma_vf_nxv32bf16( %va, bfl ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a1, a1, a0 ; CHECK-NEXT: slli a0, a0, 2 ; CHECK-NEXT: add a0, a0, a1 @@ -1059,14 +1055,13 @@ define @vfma_vf_nxv32bf16_commute( ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli 
a1, a1, 1 ; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: slli a1, a1, 2 +; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a2, a2, a1 ; CHECK-NEXT: slli a1, a1, 2 ; CHECK-NEXT: add a1, a1, a2 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x2a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 42 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv.v.x v24, a1 @@ -1200,9 +1195,8 @@ define @vfma_vf_nxv32bf16_commute( ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a1, a1, a0 ; CHECK-NEXT: slli a0, a0, 2 ; CHECK-NEXT: add a0, a0, a1 @@ -2247,14 +2241,13 @@ define @vfma_vv_nxv32f16( %va, @vfma_vv_nxv32f16( %va, @vfma_vf_nxv32f16( %va, half %b, ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 1 ; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 2 +; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: add a2, a2, a1 ; ZVFHMIN-NEXT: slli a1, a1, 2 ; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x2a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 42 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v24, a1 @@ -2656,9 +2647,8 @@ define @vfma_vf_nxv32f16( %va, half %b, ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr 
a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 1 ; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add a1, a1, a0 ; ZVFHMIN-NEXT: slli a0, a0, 2 ; ZVFHMIN-NEXT: add a0, a0, a1 @@ -2683,14 +2673,13 @@ define @vfma_vf_nxv32f16_commute( %va, ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 1 ; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 2 +; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: add a2, a2, a1 ; ZVFHMIN-NEXT: slli a1, a1, 2 ; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x2a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 42 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v24, a1 @@ -2824,9 +2813,8 @@ define @vfma_vf_nxv32f16_commute( %va, ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 1 ; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add a1, a1, a0 ; ZVFHMIN-NEXT: slli a0, a0, 2 ; ZVFHMIN-NEXT: add a0, a0, a1 @@ -8663,14 +8651,13 @@ define @vfmsub_vv_nxv32f16( %va, @vfmsub_vv_nxv32f16( %va, @vfnmsub_vf_nxv32f16_neg_splat_commute( @vfnmsub_vf_nxv32f16_neg_splat_commute( @vfmul_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a1, a1, a2 +; ZVFHMIN-NEXT: slli a2, a1, 4 +; ZVFHMIN-NEXT: add a1, a2, a1 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 
0x11, 0x10, 0x22, 0x11, 0x12, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 18 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb ; ZVFHMIN-NEXT: vmv8r.v v24, v8 ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma @@ -673,10 +671,8 @@ define @vfmul_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: slli a1, a0, 4 +; ZVFHMIN-NEXT: add a0, a1, a0 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: addi sp, sp, 16 ; ZVFHMIN-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll index 449130e59876f..dd57b65b50f4f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll @@ -470,12 +470,10 @@ define @vfsub_vf_nxv32bf16( %va, bf ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: slli a2, a1, 4 +; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x12, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 18 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb ; CHECK-NEXT: vmv8r.v v24, v8 ; CHECK-NEXT: fmv.x.h a1, fa0 ; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma @@ -539,10 +537,8 @@ define @vfsub_vf_nxv32bf16( %va, bf ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: mv a1, a0 -; 
CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: slli a1, a0, 4 +; CHECK-NEXT: add a0, a1, a0 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -1213,12 +1209,10 @@ define @vfsub_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 1 -; ZVFHMIN-NEXT: mv a2, a1 -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: add a1, a1, a2 +; ZVFHMIN-NEXT: slli a2, a1, 4 +; ZVFHMIN-NEXT: add a1, a2, a1 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x12, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 18 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb ; ZVFHMIN-NEXT: vmv8r.v v24, v8 ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m8, ta, ma @@ -1282,10 +1276,8 @@ define @vfsub_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: slli a1, a0, 4 +; ZVFHMIN-NEXT: add a0, a1, a0 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: addi sp, sp, 16 ; ZVFHMIN-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll index 4d715c7031000..0028f3035c273 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll @@ -2469,10 +2469,10 @@ define void @vpscatter_baseidx_sext_nxv16i16_nxv16f64( %va ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: li a4, 10 -; RV64-NEXT: mul a3, a3, a4 +; RV64-NEXT: slli a4, a3, 3 +; RV64-NEXT: add a3, a4, a3 ; RV64-NEXT: sub sp, sp, 
a3 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x0a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 10 * vlenb +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x09, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 9 * vlenb ; RV64-NEXT: vl4re16.v v24, (a1) ; RV64-NEXT: addi a1, sp, 16 ; RV64-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill @@ -2509,8 +2509,8 @@ define void @vpscatter_baseidx_sext_nxv16i16_nxv16f64( %va ; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: li a1, 10 -; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: slli a1, a0, 3 +; RV64-NEXT: add a0, a1, a0 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert.ll b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert.ll index a869b433a4952..72f25268109a1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert.ll @@ -75,7 +75,6 @@ define @test3( %0, %1, @test3( %0, %1, @test3( %0, %1, @test3( %0, %1, Date: Mon, 30 Sep 2024 11:44:49 -0700 Subject: [PATCH 012/151] [LegalizeVectorOps] Enable ExpandFABS/COPYSIGN to use integer ops for fixed vectors in some cases. (#109232) Copy the same FSUB check from ExpandFNEG to avoid breaking AArch64 and ARM. 
--- .../SelectionDAG/LegalizeVectorOps.cpp | 23 +- .../test/CodeGen/NVPTX/bf16x2-instructions.ll | 33 +- llvm/test/CodeGen/NVPTX/f16x2-instructions.ll | 128 +- llvm/test/CodeGen/PowerPC/vec_abs.ll | 14 +- .../CodeGen/RISCV/rvv/fixed-vectors-fp.ll | 2059 ++--------------- llvm/test/CodeGen/Thumb2/mve-fmath.ll | 214 +- .../CodeGen/WebAssembly/simd-unsupported.ll | 12 +- 7 files changed, 381 insertions(+), 2102 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 5d433204d5da0..0adf3cfb34c94 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -1804,9 +1804,12 @@ SDValue VectorLegalizer::ExpandFNEG(SDNode *Node) { EVT VT = Node->getValueType(0); EVT IntVT = VT.changeVectorElementTypeToInteger(); + if (!TLI.isOperationLegalOrCustom(ISD::XOR, IntVT)) + return SDValue(); + // FIXME: The FSUB check is here to force unrolling v1f64 vectors on AArch64. - if (!TLI.isOperationLegalOrCustom(ISD::XOR, IntVT) || - !(TLI.isOperationLegalOrCustomOrPromote(ISD::FSUB, VT) || VT.isScalableVector())) + if (!TLI.isOperationLegalOrCustomOrPromote(ISD::FSUB, VT) && + !VT.isScalableVector()) return SDValue(); SDLoc DL(Node); @@ -1821,8 +1824,12 @@ SDValue VectorLegalizer::ExpandFABS(SDNode *Node) { EVT VT = Node->getValueType(0); EVT IntVT = VT.changeVectorElementTypeToInteger(); - // FIXME: We shouldn't restrict this to scalable vectors. - if (!TLI.isOperationLegalOrCustom(ISD::AND, IntVT) || !VT.isScalableVector()) + if (!TLI.isOperationLegalOrCustom(ISD::AND, IntVT)) + return SDValue(); + + // FIXME: The FSUB check is here to force unrolling v1f64 vectors on AArch64. 
+ if (!TLI.isOperationLegalOrCustomOrPromote(ISD::FSUB, VT) && + !VT.isScalableVector()) return SDValue(); SDLoc DL(Node); @@ -1837,10 +1844,14 @@ SDValue VectorLegalizer::ExpandFCOPYSIGN(SDNode *Node) { EVT VT = Node->getValueType(0); EVT IntVT = VT.changeVectorElementTypeToInteger(); - // FIXME: We shouldn't restrict this to scalable vectors. if (VT != Node->getOperand(1).getValueType() || !TLI.isOperationLegalOrCustom(ISD::AND, IntVT) || - !TLI.isOperationLegalOrCustom(ISD::OR, IntVT) || !VT.isScalableVector()) + !TLI.isOperationLegalOrCustom(ISD::OR, IntVT)) + return SDValue(); + + // FIXME: The FSUB check is here to force unrolling v1f64 vectors on AArch64. + if (!TLI.isOperationLegalOrCustomOrPromote(ISD::FSUB, VT) && + !VT.isScalableVector()) return SDValue(); SDLoc DL(Node); diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll index 7030e5435f723..8d40a9ef54dca 100644 --- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll @@ -508,21 +508,24 @@ define <2 x bfloat> @test_round(<2 x bfloat> %a) #0 { ; CHECK-LABEL: test_copysign( ; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_copysign_param_0]; ; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_copysign_param_1]; -; CHECK-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-DAG: abs.bf16 [[AW1:%rs[0-9]+]], [[A1]]; -; CHECK-DAG: neg.bf16 [[AY1:%rs[0-9]+]], [[AW1]]; -; CHECK-DAG: shr.u16 [[BS1:%rs[0-9]+]], [[B1]], 15; -; CHECK-DAG: and.b16 [[BR1:%rs[0-9]+]], [[BS1]], 1; -; CHECK-DAG: setp.eq.b16 [[P1:%p[0-9]+]], [[BR1]], 1; -; CHECK-DAG: selp.b16 [[RS1:%rs[0-9]+]], [[AY1]], [[AW1]], [[P1]] -; CHECK-DAG: abs.bf16 [[AW0:%rs[0-9]+]], [[A0]]; -; CHECK-DAG: neg.bf16 [[AY0:%rs[0-9]+]], [[AW0]]; -; CHECK-DAG: shr.u16 [[BS0:%rs[0-9]+]], [[B0]], 15; -; CHECK-DAG: and.b16 [[BR0:%rs[0-9]+]], [[BS0]], 1; -; CHECK-DAG: setp.eq.b16 
[[P0:%p[0-9]+]], [[BR0]], 1; -; CHECK-DAG: selp.b16 [[RS0:%rs[0-9]+]], [[AY0]], [[AW0]], [[P0]] -; CHECK-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS0]], [[RS1]]} +; SM80-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] +; SM80-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] +; SM80-DAG: abs.bf16 [[AW1:%rs[0-9]+]], [[A1]]; +; SM80-DAG: neg.bf16 [[AY1:%rs[0-9]+]], [[AW1]]; +; SM80-DAG: shr.u16 [[BS1:%rs[0-9]+]], [[B1]], 15; +; SM80-DAG: and.b16 [[BR1:%rs[0-9]+]], [[BS1]], 1; +; SM80-DAG: setp.eq.b16 [[P1:%p[0-9]+]], [[BR1]], 1; +; SM80-DAG: selp.b16 [[RS1:%rs[0-9]+]], [[AY1]], [[AW1]], [[P1]] +; SM80-DAG: abs.bf16 [[AW0:%rs[0-9]+]], [[A0]]; +; SM80-DAG: neg.bf16 [[AY0:%rs[0-9]+]], [[AW0]]; +; SM80-DAG: shr.u16 [[BS0:%rs[0-9]+]], [[B0]], 15; +; SM80-DAG: and.b16 [[BR0:%rs[0-9]+]], [[BS0]], 1; +; SM80-DAG: setp.eq.b16 [[P0:%p[0-9]+]], [[BR0]], 1; +; SM80-DAG: selp.b16 [[RS0:%rs[0-9]+]], [[AY0]], [[AW0]], [[P0]] +; SM80-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS0]], [[RS1]]} +; SM90-DAG: and.b32 [[R1:%r[0-9]+]], [[B]], -2147450880; +; SM90-DAG: and.b32 [[R2:%r[0-9]+]], [[A]], 2147450879; +; SM90-DAG: or.b32 [[R:%r[0-9]+]], [[R2]], [[R1]]; ; CHECK: st.param.b32 [func_retval0+0], [[R]]; ; CHECK: ret; define <2 x bfloat> @test_copysign(<2 x bfloat> %a, <2 x bfloat> %b) #0 { diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll index 464b3a754804f..b41f63b783d39 100644 --- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll @@ -1184,14 +1184,15 @@ define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 { ; CHECK-LABEL: test_fabs( ; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_fabs_param_0]; -; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: abs.f32 [[RF0:%f[0-9]+]], [[AF0]]; -; CHECK-DAG: abs.f32 [[RF1:%f[0-9]+]], [[AF1]]; -; 
CHECK-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[RF0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[RF1]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} +; CHECK-NOF16: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; +; CHECK-NOF16-DAG: abs.f32 [[RF0:%f[0-9]+]], [[AF0]]; +; CHECK-NOF16-DAG: abs.f32 [[RF1:%f[0-9]+]], [[AF1]]; +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[RF0]]; +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[RF1]]; +; CHECK-NOF16: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} +; CHECK-F16: and.b32 [[R:%r[0-9]+]], [[A]], 2147450879; ; CHECK: st.param.b32 [func_retval0+0], [[R]]; ; CHECK: ret; define <2 x half> @test_fabs(<2 x half> %a) #0 { @@ -1244,15 +1245,18 @@ define <2 x half> @test_maxnum(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-LABEL: test_copysign( ; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_copysign_param_0]; ; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_copysign_param_1]; -; CHECK-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-DAG: and.b16 [[AX0:%rs[0-9]+]], [[A0]], 32767; -; CHECK-DAG: and.b16 [[AX1:%rs[0-9]+]], [[A1]], 32767; -; CHECK-DAG: and.b16 [[BX0:%rs[0-9]+]], [[B0]], -32768; -; CHECK-DAG: and.b16 [[BX1:%rs[0-9]+]], [[B1]], -32768; -; CHECK-DAG: or.b16 [[R0:%rs[0-9]+]], [[AX0]], [[BX0]]; -; CHECK-DAG: or.b16 [[R1:%rs[0-9]+]], [[AX1]], [[BX1]]; -; CHECK-DAG: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} +; CHECK-NOF16-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: and.b16 [[AX0:%rs[0-9]+]], [[A0]], 32767; +; CHECK-NOF16-DAG: and.b16 [[AX1:%rs[0-9]+]], [[A1]], 32767; +; CHECK-NOF16-DAG: and.b16 [[BX0:%rs[0-9]+]], [[B0]], -32768; +; CHECK-NOF16-DAG: and.b16 [[BX1:%rs[0-9]+]], [[B1]], -32768; +; CHECK-NOF16-DAG: 
or.b16 [[R0:%rs[0-9]+]], [[AX0]], [[BX0]]; +; CHECK-NOF16-DAG: or.b16 [[R1:%rs[0-9]+]], [[AX1]], [[BX1]]; +; CHECK-NOF16-DAG: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} +; CHECK-F16-DAG: and.b32 [[R0:%r[0-9]+]], [[B]], -2147450880; +; CHECK-F16-DAG: and.b32 [[R1:%r[0-9]+]], [[A]], 2147450879; +; CHECK-F16-DAG: or.b32 [[R:%r[0-9]+]], [[R1]], [[R0]] ; CHECK: st.param.b32 [func_retval0+0], [[R]]; ; CHECK: ret; define <2 x half> @test_copysign(<2 x half> %a, <2 x half> %b) #0 { @@ -1263,18 +1267,24 @@ define <2 x half> @test_copysign(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-LABEL: test_copysign_f32( ; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_copysign_f32_param_0]; ; CHECK-DAG: ld.param.v2.f32 {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_copysign_f32_param_1]; -; CHECK-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 [[BI0:%r[0-9]+]], [[B0]]; -; CHECK-DAG: mov.b32 [[BI1:%r[0-9]+]], [[B1]]; -; CHECK-DAG: and.b16 [[AI0:%rs[0-9]+]], [[A0]], 32767; -; CHECK-DAG: and.b16 [[AI1:%rs[0-9]+]], [[A1]], 32767; -; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[BI0]], -2147483648; -; CHECK-DAG: and.b32 [[BX1:%r[0-9]+]], [[BI1]], -2147483648; -; CHECK-DAG: mov.b32 {tmp, [[BZ0:%rs[0-9]+]]}, [[BX0]]; } -; CHECK-DAG: mov.b32 {tmp, [[BZ1:%rs[0-9]+]]}, [[BX1]]; } -; CHECK-DAG: or.b16 [[R0:%rs[0-9]+]], [[AI0]], [[BZ0]]; -; CHECK-DAG: or.b16 [[R1:%rs[0-9]+]], [[AI1]], [[BZ1]]; -; CHECK-DAG: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} +; CHECK-NOF16-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 [[BI0:%r[0-9]+]], [[B0]]; +; CHECK-NOF16-DAG: mov.b32 [[BI1:%r[0-9]+]], [[B1]]; +; CHECK-NOF16-DAG: and.b16 [[AI0:%rs[0-9]+]], [[A0]], 32767; +; CHECK-NOF16-DAG: and.b16 [[AI1:%rs[0-9]+]], [[A1]], 32767; +; CHECK-NOF16-DAG: and.b32 [[BX0:%r[0-9]+]], [[BI0]], -2147483648; +; CHECK-NOF16-DAG: and.b32 [[BX1:%r[0-9]+]], [[BI1]], -2147483648; +; CHECK-NOF16-DAG: mov.b32 {tmp, [[BZ0:%rs[0-9]+]]}, [[BX0]]; } +; CHECK-NOF16-DAG: mov.b32 {tmp, 
[[BZ1:%rs[0-9]+]]}, [[BX1]]; } +; CHECK-NOF16-DAG: or.b16 [[R0:%rs[0-9]+]], [[AI0]], [[BZ0]]; +; CHECK-NOF16-DAG: or.b16 [[R1:%rs[0-9]+]], [[AI1]], [[BZ1]]; +; CHECK-NOF16-DAG: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} +; CHECK-F16-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[B1]]; +; CHECK-F16-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[B0]]; +; CHECK-F16-DAG: mov.b32 [[R2:%r[0-9]+]], {[[R1]], [[R0]]}; +; CHECK-F16-DAG: and.b32 [[R3:%r[0-9]+]], [[R2]], -2147450880; +; CHECK-F16-DAG: and.b32 [[R4:%r[0-9]+]], [[A]], 2147450879; +; CHECK-F16-DAG: or.b32 [[R:%r[0-9]+]], [[R4]], [[R3]] ; CHECK: st.param.b32 [func_retval0+0], [[R]]; ; CHECK: ret; define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 { @@ -1286,20 +1296,26 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 { ; CHECK-LABEL: test_copysign_f64( ; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_copysign_f64_param_0]; ; CHECK-DAG: ld.param.v2.f64 {[[B0:%fd[0-9]+]], [[B1:%fd[0-9]+]]}, [test_copysign_f64_param_1]; -; CHECK-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b64 [[BI0:%rd[0-9]+]], [[B0]]; -; CHECK-DAG: mov.b64 [[BI1:%rd[0-9]+]], [[B1]]; -; CHECK-DAG: and.b16 [[AI0:%rs[0-9]+]], [[A0]], 32767; -; CHECK-DAG: and.b16 [[AI1:%rs[0-9]+]], [[A1]], 32767; -; CHECK-DAG: and.b64 [[BX0:%rd[0-9]+]], [[BI0]], -9223372036854775808; -; CHECK-DAG: and.b64 [[BX1:%rd[0-9]+]], [[BI1]], -9223372036854775808; -; CHECK-DAG: shr.u64 [[BY0:%rd[0-9]+]], [[BX0]], 48; -; CHECK-DAG: shr.u64 [[BY1:%rd[0-9]+]], [[BX1]], 48; -; CHECK-DAG: cvt.u16.u64 [[BZ0:%rs[0-9]+]], [[BY0]]; -; CHECK-DAG: cvt.u16.u64 [[BZ1:%rs[0-9]+]], [[BY1]]; -; CHECK-DAG: or.b16 [[R0:%rs[0-9]+]], [[AI0]], [[BZ0]]; -; CHECK-DAG: or.b16 [[R1:%rs[0-9]+]], [[AI1]], [[BZ1]]; -; CHECK-DAG: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} +; CHECK-NOF16-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b64 [[BI0:%rd[0-9]+]], [[B0]]; +; CHECK-NOF16-DAG: mov.b64 [[BI1:%rd[0-9]+]], [[B1]]; 
+; CHECK-NOF16-DAG: and.b16 [[AI0:%rs[0-9]+]], [[A0]], 32767; +; CHECK-NOF16-DAG: and.b16 [[AI1:%rs[0-9]+]], [[A1]], 32767; +; CHECK-NOF16-DAG: and.b64 [[BX0:%rd[0-9]+]], [[BI0]], -9223372036854775808; +; CHECK-NOF16-DAG: and.b64 [[BX1:%rd[0-9]+]], [[BI1]], -9223372036854775808; +; CHECK-NOF16-DAG: shr.u64 [[BY0:%rd[0-9]+]], [[BX0]], 48; +; CHECK-NOF16-DAG: shr.u64 [[BY1:%rd[0-9]+]], [[BX1]], 48; +; CHECK-NOF16-DAG: cvt.u16.u64 [[BZ0:%rs[0-9]+]], [[BY0]]; +; CHECK-NOF16-DAG: cvt.u16.u64 [[BZ1:%rs[0-9]+]], [[BY1]]; +; CHECK-NOF16-DAG: or.b16 [[R0:%rs[0-9]+]], [[AI0]], [[BZ0]]; +; CHECK-NOF16-DAG: or.b16 [[R1:%rs[0-9]+]], [[AI1]], [[BZ1]]; +; CHECK-NOF16-DAG: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} +; CHECK-F16-DAG: cvt.rn.f16.f64 [[R0:%rs[0-9]+]], [[B1]]; +; CHECK-F16-DAG: cvt.rn.f16.f64 [[R1:%rs[0-9]+]], [[B0]]; +; CHECK-F16-DAG: mov.b32 [[R2:%r[0-9]+]], {[[R1]], [[R0]]}; +; CHECK-F16-DAG: and.b32 [[R3:%r[0-9]+]], [[R2]], -2147450880; +; CHECK-F16-DAG: and.b32 [[R4:%r[0-9]+]], [[A]], 2147450879; +; CHECK-F16-DAG: or.b32 [[R:%r[0-9]+]], [[R4]], [[R3]]; ; CHECK: st.param.b32 [func_retval0+0], [[R]]; ; CHECK: ret; define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 { @@ -1311,16 +1327,22 @@ define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 { ; CHECK-LABEL: test_copysign_extended( ; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_copysign_extended_param_0]; ; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_copysign_extended_param_1]; -; CHECK-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-DAG: and.b16 [[AX0:%rs[0-9]+]], [[A0]], 32767; -; CHECK-DAG: and.b16 [[AX1:%rs[0-9]+]], [[A1]], 32767; -; CHECK-DAG: and.b16 [[BX0:%rs[0-9]+]], [[B0]], -32768; -; CHECK-DAG: and.b16 [[BX1:%rs[0-9]+]], [[B1]], -32768; -; CHECK-DAG: or.b16 [[R0:%rs[0-9]+]], [[AX0]], [[BX0]]; -; CHECK-DAG: or.b16 [[R1:%rs[0-9]+]], [[AX1]], [[BX1]]; -; CHECK-DAG: cvt.f32.f16 
[[XR0:%f[0-9]+]], [[R0]]; -; CHECK-DAG: cvt.f32.f16 [[XR1:%f[0-9]+]], [[R1]]; +; CHECK-NOF16-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: and.b16 [[AX0:%rs[0-9]+]], [[A0]], 32767; +; CHECK-NOF16-DAG: and.b16 [[AX1:%rs[0-9]+]], [[A1]], 32767; +; CHECK-NOF16-DAG: and.b16 [[BX0:%rs[0-9]+]], [[B0]], -32768; +; CHECK-NOF16-DAG: and.b16 [[BX1:%rs[0-9]+]], [[B1]], -32768; +; CHECK-NOF16-DAG: or.b16 [[R0:%rs[0-9]+]], [[AX0]], [[BX0]]; +; CHECK-NOF16-DAG: or.b16 [[R1:%rs[0-9]+]], [[AX1]], [[BX1]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[XR0:%f[0-9]+]], [[R0]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[XR1:%f[0-9]+]], [[R1]]; +; CHECK-F16-DAG: and.b32 [[R0:%r[0-9]+]], [[B]], -2147450880; +; CHECK-F16-DAG: and.b32 [[R1:%r[0-9]+]], [[A]], 2147450879; +; CHECK-F16-DAG: or.b32 [[R2:%r[0-9]+]], [[R1]], [[R0]] +; CHECK-F16-DAG: mov.b32 {[[R3:%rs[0-9]+]], [[R4:%rs[0-9]+]]}, [[R2]] +; CHECK-F16-DAG: cvt.f32.f16 [[XR0:%f[0-9]+]], [[R3]] +; CHECK-F16-DAG: cvt.f32.f16 [[XR1:%f[0-9]+]], [[R4]] ; CHECK: st.param.v2.f32 [func_retval0+0], {[[XR0]], [[XR1]]}; ; CHECK: ret; define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 { diff --git a/llvm/test/CodeGen/PowerPC/vec_abs.ll b/llvm/test/CodeGen/PowerPC/vec_abs.ll index 50dcfc3faf62e..b900f0ea29c4a 100644 --- a/llvm/test/CodeGen/PowerPC/vec_abs.ll +++ b/llvm/test/CodeGen/PowerPC/vec_abs.ll @@ -19,10 +19,9 @@ declare <4 x float> @llvm.fabs.v4f32(<4 x float>) #1 ; CHECK: xvabssp ; CHECK: blr -; CHECK-NOVSX: fabs -; CHECK-NOVSX: fabs -; CHECK-NOVSX: fabs -; CHECK-NOVSX: fabs +; CHECK-NOVSX: vspltisb +; CHECK-NOVSX: vslw +; CHECK-NOVSX: vandc ; CHECK-NOVSX: blr define <4 x float> @test2_float(<4 x float> %aa) #0 { @@ -40,11 +39,8 @@ define <4 x float> @test2_float(<4 x float> %aa) #0 { ; CHECK: xvnabssp ; CHECK: blr ; CHECK-NOVSX: vspltisb -; CHECK-NOVSX: fabs -; CHECK-NOVSX: fabs -; CHECK-NOVSX: fabs -; CHECK-NOVSX: fabs -; 
CHECK-NOVSX: vxor +; CHECK-NOVSX: vslw +; CHECK-NOVSX: vor ; CHECK-NOVSX: blr define <2 x double> @test_double(<2 x double> %aa) #0 { diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll index d665d23dec68a..69faf269ae3db 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll @@ -508,85 +508,15 @@ define void @fabs_v8f16(ptr %x) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fabs_v8f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: addi sp, sp, -16 -; ZVFHMIN-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: mv a1, sp -; ZVFHMIN-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-RV32-NEXT: lhu a1, 2(sp) -; ZVFHMIN-RV32-NEXT: lui a2, 8 -; ZVFHMIN-RV32-NEXT: lhu a3, 0(sp) -; ZVFHMIN-RV32-NEXT: addi a2, a2, -1 -; ZVFHMIN-RV32-NEXT: and a1, a1, a2 -; ZVFHMIN-RV32-NEXT: lhu a4, 4(sp) -; ZVFHMIN-RV32-NEXT: and a3, a3, a2 -; ZVFHMIN-RV32-NEXT: vmv.v.x v8, a3 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-RV32-NEXT: and a4, a4, a2 -; ZVFHMIN-RV32-NEXT: lhu a1, 6(sp) -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-RV32-NEXT: lhu a3, 10(sp) -; ZVFHMIN-RV32-NEXT: lhu a4, 8(sp) -; ZVFHMIN-RV32-NEXT: and a1, a1, a2 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-RV32-NEXT: and a3, a3, a2 -; ZVFHMIN-RV32-NEXT: and a4, a4, a2 -; ZVFHMIN-RV32-NEXT: lhu a1, 12(sp) -; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a4 -; ZVFHMIN-RV32-NEXT: lhu a4, 14(sp) -; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a3 -; ZVFHMIN-RV32-NEXT: and a1, a1, a2 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-RV32-NEXT: and a2, a4, a2 -; ZVFHMIN-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a2 -; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: addi sp, sp, 16 -; 
ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fabs_v8f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: addi sp, sp, -16 -; ZVFHMIN-RV64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: mv a1, sp -; ZVFHMIN-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-RV64-NEXT: lhu a1, 2(sp) -; ZVFHMIN-RV64-NEXT: lui a2, 8 -; ZVFHMIN-RV64-NEXT: lhu a3, 0(sp) -; ZVFHMIN-RV64-NEXT: addiw a2, a2, -1 -; ZVFHMIN-RV64-NEXT: and a1, a1, a2 -; ZVFHMIN-RV64-NEXT: lhu a4, 4(sp) -; ZVFHMIN-RV64-NEXT: and a3, a3, a2 -; ZVFHMIN-RV64-NEXT: vmv.v.x v8, a3 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-RV64-NEXT: and a4, a4, a2 -; ZVFHMIN-RV64-NEXT: lhu a1, 6(sp) -; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-RV64-NEXT: lhu a3, 10(sp) -; ZVFHMIN-RV64-NEXT: lhu a4, 8(sp) -; ZVFHMIN-RV64-NEXT: and a1, a1, a2 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-RV64-NEXT: and a3, a3, a2 -; ZVFHMIN-RV64-NEXT: and a4, a4, a2 -; ZVFHMIN-RV64-NEXT: lhu a1, 12(sp) -; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a4 -; ZVFHMIN-RV64-NEXT: lhu a4, 14(sp) -; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a3 -; ZVFHMIN-RV64-NEXT: and a1, a1, a2 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-RV64-NEXT: and a2, a4, a2 -; ZVFHMIN-RV64-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a2 -; ZVFHMIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-RV64-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV64-NEXT: addi sp, sp, 16 -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fabs_v8f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: ret %a = load <8 x half>, ptr %x %b = call <8 x half> @llvm.fabs.v8f16(<8 x half> %a) store <8 x half> %b, ptr %x @@ -603,89 +533,15 @@ define void @fabs_v6f16(ptr 
%x) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-RV32-LABEL: fabs_v6f16: -; ZVFHMIN-RV32: # %bb.0: -; ZVFHMIN-RV32-NEXT: addi sp, sp, -16 -; ZVFHMIN-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV32-NEXT: mv a1, sp -; ZVFHMIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-RV32-NEXT: lhu a1, 2(sp) -; ZVFHMIN-RV32-NEXT: lui a2, 8 -; ZVFHMIN-RV32-NEXT: lhu a3, 0(sp) -; ZVFHMIN-RV32-NEXT: addi a2, a2, -1 -; ZVFHMIN-RV32-NEXT: and a1, a1, a2 -; ZVFHMIN-RV32-NEXT: lhu a4, 4(sp) -; ZVFHMIN-RV32-NEXT: and a3, a3, a2 -; ZVFHMIN-RV32-NEXT: vmv.v.x v8, a3 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-RV32-NEXT: and a4, a4, a2 -; ZVFHMIN-RV32-NEXT: lhu a1, 6(sp) -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-RV32-NEXT: lhu a3, 10(sp) -; ZVFHMIN-RV32-NEXT: lhu a4, 8(sp) -; ZVFHMIN-RV32-NEXT: and a1, a1, a2 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-RV32-NEXT: and a3, a3, a2 -; ZVFHMIN-RV32-NEXT: and a4, a4, a2 -; ZVFHMIN-RV32-NEXT: lhu a1, 12(sp) -; ZVFHMIN-RV32-NEXT: vmv.v.x v9, a4 -; ZVFHMIN-RV32-NEXT: lhu a4, 14(sp) -; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a3 -; ZVFHMIN-RV32-NEXT: and a1, a1, a2 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-RV32-NEXT: and a2, a4, a2 -; ZVFHMIN-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-RV32-NEXT: vslide1down.vx v9, v9, a2 -; ZVFHMIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, mu -; ZVFHMIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV32-NEXT: addi sp, sp, 16 -; ZVFHMIN-RV32-NEXT: ret -; -; ZVFHMIN-RV64-LABEL: fabs_v6f16: -; ZVFHMIN-RV64: # %bb.0: -; ZVFHMIN-RV64-NEXT: addi sp, sp, -16 -; ZVFHMIN-RV64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-RV64-NEXT: mv a1, sp -; ZVFHMIN-RV64-NEXT: vsetivli 
zero, 8, e16, mf2, ta, ma -; ZVFHMIN-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-RV64-NEXT: lhu a1, 2(sp) -; ZVFHMIN-RV64-NEXT: lui a2, 8 -; ZVFHMIN-RV64-NEXT: lhu a3, 0(sp) -; ZVFHMIN-RV64-NEXT: addiw a2, a2, -1 -; ZVFHMIN-RV64-NEXT: and a1, a1, a2 -; ZVFHMIN-RV64-NEXT: lhu a4, 4(sp) -; ZVFHMIN-RV64-NEXT: and a3, a3, a2 -; ZVFHMIN-RV64-NEXT: vmv.v.x v8, a3 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-RV64-NEXT: and a4, a4, a2 -; ZVFHMIN-RV64-NEXT: lhu a1, 6(sp) -; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-RV64-NEXT: lhu a3, 10(sp) -; ZVFHMIN-RV64-NEXT: lhu a4, 8(sp) -; ZVFHMIN-RV64-NEXT: and a1, a1, a2 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-RV64-NEXT: and a3, a3, a2 -; ZVFHMIN-RV64-NEXT: and a4, a4, a2 -; ZVFHMIN-RV64-NEXT: lhu a1, 12(sp) -; ZVFHMIN-RV64-NEXT: vmv.v.x v9, a4 -; ZVFHMIN-RV64-NEXT: lhu a4, 14(sp) -; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a3 -; ZVFHMIN-RV64-NEXT: and a1, a1, a2 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-RV64-NEXT: and a2, a4, a2 -; ZVFHMIN-RV64-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-RV64-NEXT: vslide1down.vx v9, v9, a2 -; ZVFHMIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, mu -; ZVFHMIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-RV64-NEXT: vse16.v v9, (a0) -; ZVFHMIN-RV64-NEXT: addi sp, sp, 16 -; ZVFHMIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: fabs_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = call <6 x half> @llvm.fabs.v6f16(<6 x half> %a) store <6 x half> %b, ptr %x @@ -741,255 +597,18 @@ define void @copysign_v8f16(ptr %x, ptr %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-ZFH-RV32-LABEL: copysign_v8f16: -; ZVFHMIN-ZFH-RV32: # %bb.0: -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFH-RV32-NEXT: 
.cfi_def_cfa_offset 32 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu -; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 18(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 2(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 16(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 0(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa2, fa3 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 20(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 4(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 22(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 6(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa1, fa2 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 26(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 10(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 24(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 8(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa2, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 28(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 12(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 30(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV32-NEXT: 
fsgnj.h fa5, fa4, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 32 -; ZVFHMIN-ZFH-RV32-NEXT: ret -; -; ZVFHMIN-ZFH-RV64-LABEL: copysign_v8f16: -; ZVFHMIN-ZFH-RV64: # %bb.0: -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu -; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 18(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 2(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 16(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 0(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa2, fa3 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 20(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 4(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 22(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 6(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa1, fa2 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 26(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 10(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 24(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 8(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a5, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa2, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 28(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 12(sp) -; ZVFHMIN-ZFH-RV64-NEXT: 
vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 30(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp) -; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a5 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 32 -; ZVFHMIN-ZFH-RV64-NEXT: ret -; -; ZVFHMIN-ZFHIN-RV32-LABEL: copysign_v8f16: -; ZVFHMIN-ZFHIN-RV32: # %bb.0: -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu -; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: lh a2, 18(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lui a1, 1048568 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a2, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a4, 2(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lui a5, 8 -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a2, a5, -1 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a6, 16(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a7, 0(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a4, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a6, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a7, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a6, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: lh a6, 20(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a7, 4(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a4 -; 
ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a6, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a7, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: lh a6, 22(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a7, 6(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a4, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a6, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a7, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: lh a6, 26(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a4, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a3, 10(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a6, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a6, 24(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a7, 8(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a3, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a6, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a7, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a5, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: lh a5, 28(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a6, 12(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a6, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: lh a4, 30(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a6, 14(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a3, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a4, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a6, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a2, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 32 -; ZVFHMIN-ZFHIN-RV32-NEXT: ret -; -; ZVFHMIN-ZFHIN-RV64-LABEL: copysign_v8f16: -; ZVFHMIN-ZFHIN-RV64: # %bb.0: -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, 
e16, mf2, ta, mu -; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: lh a2, 18(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lui a1, 1048568 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a2, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a4, 2(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lui a5, 8 -; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a2, a5, -1 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a6, 16(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a7, 0(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a4, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a6, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a6, a7, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a6, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: lh a6, 20(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a7, 4(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a6, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a7, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: lh a6, 22(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a7, 6(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a4, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a6, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a7, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: lh a6, 26(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a4, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a3, 10(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a6, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a6, 24(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a7, 8(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a3, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a6, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a7, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a5, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: lh a5, 28(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a6, 12(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: 
vmv.v.x v9, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a6, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: lh a4, 30(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a6, 14(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a3, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a4, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a6, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 32 -; ZVFHMIN-ZFHIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: copysign_v8f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a1) +; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v9, v9, a1 +; ZVFHMIN-NEXT: vor.vv v8, v9, v8 +; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: ret %a = load <8 x half>, ptr %x %b = load <8 x half>, ptr %y %c = call <8 x half> @llvm.copysign.v8f16(<8 x half> %a, <8 x half> %b) @@ -1008,263 +627,20 @@ define void @copysign_v6f16(ptr %x, ptr %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-ZFH-RV32-LABEL: copysign_v6f16: -; ZVFHMIN-ZFH-RV32: # %bb.0: -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 18(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 
2(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 16(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 0(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa2, fa3 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 20(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 4(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 22(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 6(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa1, fa2 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 26(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 10(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 24(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 8(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa3, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa2, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 28(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 12(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 30(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, mu -; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 32 -; ZVFHMIN-ZFH-RV32-NEXT: ret -; -; 
ZVFHMIN-ZFH-RV64-LABEL: copysign_v6f16: -; ZVFHMIN-ZFH-RV64: # %bb.0: -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 18(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 2(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 16(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 0(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa2, fa3 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 20(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 4(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 22(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 6(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa1, fa2 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 26(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 10(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 24(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 8(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa3, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a5, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa2, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 28(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 12(sp) -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 30(sp) -; 
ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp) -; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a5 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, mu -; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 32 -; ZVFHMIN-ZFH-RV64-NEXT: ret -; -; ZVFHMIN-ZFHIN-RV32-LABEL: copysign_v6f16: -; ZVFHMIN-ZFHIN-RV32: # %bb.0: -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: lh a2, 18(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lui a1, 1048568 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a2, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a4, 2(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lui a5, 8 -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a2, a5, -1 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a6, 16(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a7, 0(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a4, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a6, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a7, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a6, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: lh a6, 20(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a7, 4(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a6, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a7, a2 -; 
ZVFHMIN-ZFHIN-RV32-NEXT: lh a6, 22(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a7, 6(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a4, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a6, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a7, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: lh a6, 26(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a4, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a3, 10(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a6, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a6, 24(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a7, 8(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a3, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a6, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a7, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a5, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: lh a5, 28(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a6, 12(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a6, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: lh a4, 30(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a6, 14(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a3, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a4, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a6, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a2, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, mu -; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 32 -; ZVFHMIN-ZFHIN-RV32-NEXT: ret -; -; ZVFHMIN-ZFHIN-RV64-LABEL: copysign_v6f16: -; ZVFHMIN-ZFHIN-RV64: # %bb.0: -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a1) -; 
ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: lh a2, 18(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lui a1, 1048568 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a2, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a4, 2(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lui a5, 8 -; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a2, a5, -1 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a6, 16(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a7, 0(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a4, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a6, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a6, a7, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a6, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: lh a6, 20(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a7, 4(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a6, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a7, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: lh a6, 22(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a7, 6(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a4, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a6, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a7, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: lh a6, 26(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a4, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a3, 10(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a6, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a6, 24(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a7, 8(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a3, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a6, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a7, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a5, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: lh a5, 28(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a6, 12(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: 
vmv.v.x v9, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a6, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: lh a4, 30(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a6, 14(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a3, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a4, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a6, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, mu -; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 32 -; ZVFHMIN-ZFHIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: copysign_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a1) +; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v9, v9, a1 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vor.vv v8, v9, v8 +; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = load <6 x half>, ptr %y %c = call <6 x half> @llvm.copysign.v6f16(<6 x half> %a, <6 x half> %b) @@ -1325,199 +701,19 @@ define void @copysign_vf_v8f16(ptr %x, half %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-ZFH-RV32-LABEL: copysign_vf_v8f16: -; ZVFHMIN-ZFH-RV32: # %bb.0: -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -16 -; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu -; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 2(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 0(sp) -; 
ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 4(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa4, fa0 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 6(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 10(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa4, fa0 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a4, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 8(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa0 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 12(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 14(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 16 -; ZVFHMIN-ZFH-RV32-NEXT: ret -; -; ZVFHMIN-ZFH-RV64-LABEL: copysign_vf_v8f16: -; ZVFHMIN-ZFH-RV64: # %bb.0: -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -16 -; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu -; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 2(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh 
fa4, 0(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa0 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 4(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa4, fa0 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 6(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa0 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 10(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa4, fa0 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a4, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 8(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa0 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a5, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa0 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 12(sp) -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa0 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 14(sp) -; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a5 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa0 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 16 -; ZVFHMIN-ZFH-RV64-NEXT: ret -; -; ZVFHMIN-ZFHIN-RV32-LABEL: copysign_vf_v8f16: -; ZVFHMIN-ZFHIN-RV32: # %bb.0: -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -16 -; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu -; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa0 
-; ZVFHMIN-ZFHIN-RV32-NEXT: lui a2, 1048568 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a2, 2(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lui a3, 8 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a4, 0(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a3, a3, -1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a2, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a4, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a5, 4(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a2, 6(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a5, a5, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a4, 10(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a5, 8(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a2, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a5, a5, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a2, 12(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a4, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a4, 14(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a2, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a4, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a3, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 16 -; ZVFHMIN-ZFHIN-RV32-NEXT: ret -; -; ZVFHMIN-ZFHIN-RV64-LABEL: copysign_vf_v8f16: -; ZVFHMIN-ZFHIN-RV64: # %bb.0: -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -16 -; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 16 
-; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu -; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-ZFHIN-RV64-NEXT: lui a2, 1048568 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a2, 2(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lui a3, 8 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a4, 0(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a3, a3, -1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a2, a2, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a4, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a5, 4(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a2, 6(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a5, a5, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a4, 10(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a5, 8(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: or a2, a2, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a5, a5, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v9, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a2, 12(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a4, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a4, 14(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a2, a2, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a4, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a3, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a0) -; 
ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 16 -; ZVFHMIN-ZFHIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: copysign_vf_v8f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vmv.v.x v9, a1 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a2, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a2 +; ZVFHMIN-NEXT: vand.vx v9, v9, a1 +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 +; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: ret %a = load <8 x half>, ptr %x %b = insertelement <8 x half> poison, half %y, i32 0 %c = shufflevector <8 x half> %b, <8 x half> poison, <8 x i32> zeroinitializer @@ -1535,211 +731,25 @@ define void @copysign_vf_v6f16(ptr %x, half %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-ZFH-RV32-LABEL: copysign_vf_v6f16: -; ZVFHMIN-ZFH-RV32: # %bb.0: -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -16 -; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 2(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 0(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 4(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa4, fa0 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 6(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 10(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa4, fa0 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a4, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 8(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 -; 
ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa0 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 12(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a2 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fmv.h fa5, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, mu -; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 16 -; ZVFHMIN-ZFH-RV32-NEXT: ret -; -; ZVFHMIN-ZFH-RV64-LABEL: copysign_vf_v6f16: -; ZVFHMIN-ZFH-RV64: # %bb.0: -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -16 -; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 2(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 0(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa0 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 4(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa4, fa0 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 6(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa0 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 10(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa4, fa0 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a4, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa0 -; 
ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 8(sp) -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa0 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 12(sp) -; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a2 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fmv.h fa5, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, mu -; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 16 -; ZVFHMIN-ZFH-RV64-NEXT: ret -; -; ZVFHMIN-ZFHIN-RV32-LABEL: copysign_vf_v6f16: -; ZVFHMIN-ZFHIN-RV32: # %bb.0: -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -16 -; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a1, fa0 -; ZVFHMIN-ZFHIN-RV32-NEXT: lui a2, 1048568 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a3, 2(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lui a4, 8 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a5, 0(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a4, a4, -1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a3, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a5, a5, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a6, 4(sp) -; 
ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a3, 6(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a6, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a5, a5, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a5, 10(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a3, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a6, 8(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a5, a5, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a6, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a3, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a1, 12(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a3, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a3, 14(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a1, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a1, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a3, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a3, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, mu -; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 16 -; ZVFHMIN-ZFHIN-RV32-NEXT: ret -; -; ZVFHMIN-ZFHIN-RV64-LABEL: copysign_vf_v6f16: -; ZVFHMIN-ZFHIN-RV64: # %bb.0: -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -16 -; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a1, fa0 -; 
ZVFHMIN-ZFHIN-RV64-NEXT: lui a2, 1048568 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a3, 2(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lui a4, 8 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a5, 0(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a4, a4, -1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a3, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a5, a5, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a6, 4(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a3, 6(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a6, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a5, a5, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a5, 10(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a3, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a6, 8(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a5, a5, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a6, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a3, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v9, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a1, 12(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a3, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a3, 14(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a1, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a1, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a3, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a2, a3, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, mu -; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 16 -; ZVFHMIN-ZFHIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: 
copysign_vf_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: fmv.x.h a1, fa5 +; ZVFHMIN-NEXT: li a2, 192 +; ZVFHMIN-NEXT: vmv.s.x v0, a2 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v9, a2 +; ZVFHMIN-NEXT: vmerge.vxm v9, v9, a1, v0 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vand.vx v9, v9, a1 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 +; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = insertelement <6 x half> poison, half %y, i32 0 %c = shufflevector <6 x half> %b, <6 x half> poison, <6 x i32> zeroinitializer @@ -1798,261 +808,19 @@ define void @copysign_neg_v8f16(ptr %x, ptr %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-ZFH-RV32-LABEL: copysign_neg_v8f16: -; ZVFHMIN-ZFH-RV32: # %bb.0: -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu -; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: lui a1, 8 -; ZVFHMIN-ZFH-RV32-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 2(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 18(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 0(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 16(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa3, fa2 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 4(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 20(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 6(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 22(sp) -; ZVFHMIN-ZFH-RV32-NEXT: 
fmv.x.h a2, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa3 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa2, fa1 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 26(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 8(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 24(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa4, fa3 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa2 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 12(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 28(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 14(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 30(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 32 -; ZVFHMIN-ZFH-RV32-NEXT: ret -; -; ZVFHMIN-ZFH-RV64-LABEL: copysign_neg_v8f16: -; ZVFHMIN-ZFH-RV64: # %bb.0: -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu -; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: lui a1, 8 -; ZVFHMIN-ZFH-RV64-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a1) -; 
ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 2(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 18(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 0(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 16(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa3, fa2 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 4(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 20(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 6(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 22(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa3 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa2, fa1 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 26(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 8(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 24(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa4, fa3 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a5, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa2 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 12(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 28(sp) -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 14(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 30(sp) -; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a5 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, 
(a0) -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 32 -; ZVFHMIN-ZFH-RV64-NEXT: ret -; -; ZVFHMIN-ZFHIN-RV32-LABEL: copysign_neg_v8f16: -; ZVFHMIN-ZFHIN-RV32: # %bb.0: -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, mu -; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: lui a3, 8 -; ZVFHMIN-ZFHIN-RV32-NEXT: vxor.vx v8, v8, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a2, 2(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, a3, -1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a2, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: lh a5, 18(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lui a2, 1048568 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a6, 0(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a7, 16(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a4, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a6, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a7, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a5, a5, a6 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a6, 4(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lh a7, 20(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a6, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a7, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a6, 6(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lh a7, 22(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a4, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a6, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a7, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a6, 10(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a4, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: lh a4, 26(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a6, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a6, 8(sp) -; 
ZVFHMIN-ZFHIN-RV32-NEXT: lhu a7, 24(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a5, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a6, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a7, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a5, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a5, 12(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lh a6, 28(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a6, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a4, 14(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lh a6, 30(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a5, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a4, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a6, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a1, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 32 -; ZVFHMIN-ZFHIN-RV32-NEXT: ret -; -; ZVFHMIN-ZFHIN-RV64-LABEL: copysign_neg_v8f16: -; ZVFHMIN-ZFHIN-RV64: # %bb.0: -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, mu -; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: lui a3, 8 -; ZVFHMIN-ZFHIN-RV64-NEXT: vxor.vx v8, v8, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a2, 2(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a1, a3, -1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a2, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: lh a5, 18(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lui a2, 1048568 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a6, 0(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a7, 16(sp) -; 
ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a4, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a6, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a6, a7, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a5, a5, a6 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a6, 4(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lh a7, 20(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a6, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a7, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a6, 6(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lh a7, 22(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a4, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a6, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a7, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a6, 10(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a4, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: lh a4, 26(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a6, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a6, 8(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a7, 24(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a5, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a6, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a7, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a5, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a5, 12(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lh a6, 28(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v9, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a6, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a4, 14(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lh a6, 30(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a5, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a4, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a6, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a1, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; 
ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 32 -; ZVFHMIN-ZFHIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: copysign_neg_v8f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a1) +; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-NEXT: addi a2, a1, -1 +; ZVFHMIN-NEXT: vand.vx v9, v9, a2 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vor.vv v8, v9, v8 +; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: ret %a = load <8 x half>, ptr %x %b = load <8 x half>, ptr %y %c = fneg <8 x half> %b @@ -2071,269 +839,21 @@ define void @copysign_neg_v6f16(ptr %x, ptr %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-ZFH-RV32-LABEL: copysign_neg_v6f16: -; ZVFHMIN-ZFH-RV32: # %bb.0: -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: lui a1, 8 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 2(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 18(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 0(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 16(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa3, fa2 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 4(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 20(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 6(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 22(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa3 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa5 -; 
ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa2, fa1 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 26(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 8(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 24(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa4, fa4, fa3 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a5, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa2 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 12(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 28(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 14(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 30(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a5 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, mu -; ZVFHMIN-ZFH-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 32 -; ZVFHMIN-ZFH-RV32-NEXT: ret -; -; ZVFHMIN-ZFH-RV64-LABEL: copysign_neg_v6f16: -; ZVFHMIN-ZFH-RV64: # %bb.0: -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: lui a1, 8 -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a1) -; 
ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 2(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 18(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 0(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 16(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa3, fa2 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 4(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 20(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 6(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 22(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa3 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa2, fa1 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 26(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a4, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 8(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 24(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa4, fa4, fa3 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a5, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa2 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 12(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 28(sp) -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 14(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 30(sp) -; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v9, a2 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a5 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, mu -; ZVFHMIN-ZFH-RV64-NEXT: 
vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v9, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 32 -; ZVFHMIN-ZFH-RV64-NEXT: ret -; -; ZVFHMIN-ZFHIN-RV32-LABEL: copysign_neg_v6f16: -; ZVFHMIN-ZFHIN-RV32: # %bb.0: -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v9, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: lui a3, 8 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vxor.vx v8, v8, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a2, 2(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a1, a3, -1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a2, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: lh a5, 18(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lui a2, 1048568 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a6, 0(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a7, 16(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a4, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a6, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a6, a7, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a5, a5, a6 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a6, 4(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lh a7, 20(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a6, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a7, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a6, 6(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lh a7, 22(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a4, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a6, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a7, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a6, 10(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a4, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a4 -; 
ZVFHMIN-ZFHIN-RV32-NEXT: lh a4, 26(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a6, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a6, 8(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a7, 24(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a4, a5, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a6, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a7, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a5, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a5, 12(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lh a6, 28(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v9, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a5, a5, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a3, a6, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a4, 14(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lh a6, 30(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a3, a5, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a4, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a6, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a1, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 6, e16, mf2, ta, mu -; ZVFHMIN-ZFHIN-RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v9, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 32 -; ZVFHMIN-ZFHIN-RV32-NEXT: ret -; -; ZVFHMIN-ZFHIN-RV64-LABEL: copysign_neg_v6f16: -; ZVFHMIN-ZFHIN-RV64: # %bb.0: -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -32 -; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 32 -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v9, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: lui a3, 8 -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVFHMIN-ZFHIN-RV64-NEXT: vxor.vx v8, v8, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: addi a1, sp, 16 -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a2, 2(sp) -; 
ZVFHMIN-ZFHIN-RV64-NEXT: addiw a1, a3, -1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a2, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: lh a5, 18(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lui a2, 1048568 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a6, 0(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a7, 16(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a4, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a6, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a6, a7, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a5, a5, a6 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a6, 4(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lh a7, 20(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a6, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a7, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a6, 6(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lh a7, 22(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a4, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a6, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a7, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a6, 10(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a4, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: lh a4, 26(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a6, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a6, 8(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a7, 24(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a4, a5, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a6, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a7, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a5, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a5, 12(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lh a6, 28(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v9, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a5, a5, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a3, a6, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a4, 14(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lh a6, 30(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: or a3, a5, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a3 -; 
ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a4, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a6, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a1, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.i v0, 15 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v9, v9, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 6, e16, mf2, ta, mu -; ZVFHMIN-ZFHIN-RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v9, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 32 -; ZVFHMIN-ZFHIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: copysign_neg_v6f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a1) +; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 +; ZVFHMIN-NEXT: addi a2, a1, -1 +; ZVFHMIN-NEXT: vand.vx v9, v9, a2 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vsetivli zero, 6, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vor.vv v8, v9, v8 +; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = load <6 x half>, ptr %y %c = fneg <6 x half> %b @@ -2396,169 +916,20 @@ define void @copysign_neg_trunc_v4f16_v4f32(ptr %x, ptr %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-ZFH-RV32-LABEL: copysign_neg_trunc_v4f16_v4f32: -; ZVFHMIN-ZFH-RV32: # %bb.0: -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -16 -; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: vle32.v v9, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-ZFH-RV32-NEXT: lui a1, 8 -; ZVFHMIN-ZFH-RV32-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 8 -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 2(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: 
flh fa4, 0(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 8(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 4(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 12(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa3 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa2, fa1 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 6(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 16 -; ZVFHMIN-ZFH-RV32-NEXT: ret -; -; ZVFHMIN-ZFH-RV64-LABEL: copysign_neg_trunc_v4f16_v4f32: -; ZVFHMIN-ZFH-RV64: # %bb.0: -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -16 -; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: vle32.v v9, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-ZFH-RV64-NEXT: lui a1, 8 -; ZVFHMIN-ZFH-RV64-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 8 -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 2(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 0(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 8(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 4(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 12(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa3 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa2, fa1 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa5 -; 
ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 6(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp) -; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 16 -; ZVFHMIN-ZFH-RV64-NEXT: ret -; -; ZVFHMIN-ZFHIN-RV32-LABEL: copysign_neg_trunc_v4f16_v4f32: -; ZVFHMIN-ZFHIN-RV32: # %bb.0: -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -16 -; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: vle32.v v9, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-ZFHIN-RV32-NEXT: lui a1, 8 -; ZVFHMIN-ZFHIN-RV32-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a2, sp, 8 -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a2) -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a2, 2(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a3, a1, -1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: lh a4, 10(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lui a5, 1048568 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a6, 0(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a7, 8(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a2, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a6, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a7, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a4, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a4, 4(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lh a6, 12(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a6, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a2, 6(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lh a6, 14(sp) -; 
ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a4, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a6, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a2, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 16 -; ZVFHMIN-ZFHIN-RV32-NEXT: ret -; -; ZVFHMIN-ZFHIN-RV64-LABEL: copysign_neg_trunc_v4f16_v4f32: -; ZVFHMIN-ZFHIN-RV64: # %bb.0: -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -16 -; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: vle32.v v9, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-ZFHIN-RV64-NEXT: lui a1, 8 -; ZVFHMIN-ZFHIN-RV64-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: addi a2, sp, 8 -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a2) -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a2, 2(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a3, a1, -1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: lh a4, 10(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lui a5, 1048568 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a6, 0(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a7, 8(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a2, a2, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a6, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a7, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a4, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a4, 4(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lh a6, 12(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a6, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a2, 6(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lh a6, 14(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a4, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, 
v8, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a6, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 16 -; ZVFHMIN-ZFHIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: copysign_neg_trunc_v4f16_v4f32: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle32.v v9, (a1) +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a2, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a2 +; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: vxor.vx v9, v10, a1 +; ZVFHMIN-NEXT: vand.vx v9, v9, a1 +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 +; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: ret %a = load <4 x half>, ptr %x %b = load <4 x float>, ptr %y %c = fneg <4 x float> %b @@ -2582,177 +953,22 @@ define void @copysign_neg_trunc_v3f16_v3f32(ptr %x, ptr %y) { ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; -; ZVFHMIN-ZFH-RV32-LABEL: copysign_neg_trunc_v3f16_v3f32: -; ZVFHMIN-ZFH-RV32: # %bb.0: -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, -16 -; ZVFHMIN-ZFH-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 3, e16, mf4, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: vle32.v v9, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: mv a1, sp -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-ZFH-RV32-NEXT: lui a1, 8 -; ZVFHMIN-ZFH-RV32-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: addi a1, sp, 8 -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 2(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 0(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa3, 8(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa2, 4(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa1, 12(sp) -; 
ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa4, fa3 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa2, fa1 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: flh fa5, 6(sp) -; ZVFHMIN-ZFH-RV32-NEXT: flh fa4, 14(sp) -; ZVFHMIN-ZFH-RV32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV32-NEXT: fsgnj.h fa5, fa5, fa4 -; ZVFHMIN-ZFH-RV32-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV32-NEXT: vsetivli zero, 3, e16, mf4, ta, ma -; ZVFHMIN-ZFH-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFH-RV32-NEXT: addi sp, sp, 16 -; ZVFHMIN-ZFH-RV32-NEXT: ret -; -; ZVFHMIN-ZFH-RV64-LABEL: copysign_neg_trunc_v3f16_v3f32: -; ZVFHMIN-ZFH-RV64: # %bb.0: -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, -16 -; ZVFHMIN-ZFH-RV64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 3, e16, mf4, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: vle32.v v9, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: mv a1, sp -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-ZFH-RV64-NEXT: lui a1, 8 -; ZVFHMIN-ZFH-RV64-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV64-NEXT: addi a1, sp, 8 -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 2(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 10(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 0(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa3, 8(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa2, 4(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa1, 12(sp) -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa4, fa3 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a2, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa2, fa1 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a3, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: flh fa5, 
6(sp) -; ZVFHMIN-ZFH-RV64-NEXT: flh fa4, 14(sp) -; ZVFHMIN-ZFH-RV64-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a3 -; ZVFHMIN-ZFH-RV64-NEXT: fsgnj.h fa5, fa5, fa4 -; ZVFHMIN-ZFH-RV64-NEXT: fmv.x.h a1, fa5 -; ZVFHMIN-ZFH-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFH-RV64-NEXT: vsetivli zero, 3, e16, mf4, ta, ma -; ZVFHMIN-ZFH-RV64-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFH-RV64-NEXT: addi sp, sp, 16 -; ZVFHMIN-ZFH-RV64-NEXT: ret -; -; ZVFHMIN-ZFHIN-RV32-LABEL: copysign_neg_trunc_v3f16_v3f32: -; ZVFHMIN-ZFHIN-RV32: # %bb.0: -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, -16 -; ZVFHMIN-ZFHIN-RV32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 3, e16, mf4, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vle16.v v8, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: vle32.v v9, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: mv a1, sp -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV32-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-ZFHIN-RV32-NEXT: lui a1, 8 -; ZVFHMIN-ZFHIN-RV32-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a2, sp, 8 -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a2) -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a2, 2(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi a3, a1, -1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: lh a4, 10(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lui a5, 1048568 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a6, 0(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a7, 8(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a2, a2, a4 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a6, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a7, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a4, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a4, 4(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lh a6, 12(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: vmv.v.x v8, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a4, a4, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a6, 
a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: lhu a2, 6(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: lh a6, 14(sp) -; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a4, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a2, a2, a3 -; ZVFHMIN-ZFHIN-RV32-NEXT: and a1, a6, a5 -; ZVFHMIN-ZFHIN-RV32-NEXT: or a1, a2, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV32-NEXT: vsetivli zero, 3, e16, mf4, ta, ma -; ZVFHMIN-ZFHIN-RV32-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFHIN-RV32-NEXT: addi sp, sp, 16 -; ZVFHMIN-ZFHIN-RV32-NEXT: ret -; -; ZVFHMIN-ZFHIN-RV64-LABEL: copysign_neg_trunc_v3f16_v3f32: -; ZVFHMIN-ZFHIN-RV64: # %bb.0: -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, -16 -; ZVFHMIN-ZFHIN-RV64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 3, e16, mf4, ta, ma -; ZVFHMIN-ZFHIN-RV64-NEXT: vle16.v v8, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: vle32.v v9, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: mv a1, sp -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a1) -; ZVFHMIN-ZFHIN-RV64-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-ZFHIN-RV64-NEXT: lui a1, 8 -; ZVFHMIN-ZFHIN-RV64-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: addi a2, sp, 8 -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a2) -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a2, 2(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: addiw a3, a1, -1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: lh a4, 10(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lui a5, 1048568 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a6, 0(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a7, 8(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a2, a2, a4 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a6, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a7, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a4, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a4, 4(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lh a6, 12(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: vmv.v.x v8, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a2 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a4, a4, a3 
-; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a6, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: lhu a2, 6(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: lh a6, 14(sp) -; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a4, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a2, a2, a3 -; ZVFHMIN-ZFHIN-RV64-NEXT: and a1, a6, a5 -; ZVFHMIN-ZFHIN-RV64-NEXT: or a1, a2, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: vslide1down.vx v8, v8, a1 -; ZVFHMIN-ZFHIN-RV64-NEXT: vsetivli zero, 3, e16, mf4, ta, ma -; ZVFHMIN-ZFHIN-RV64-NEXT: vse16.v v8, (a0) -; ZVFHMIN-ZFHIN-RV64-NEXT: addi sp, sp, 16 -; ZVFHMIN-ZFHIN-RV64-NEXT: ret +; ZVFHMIN-LABEL: copysign_neg_trunc_v3f16_v3f32: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 3, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle32.v v9, (a1) +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a2, a1, -1 +; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a2 +; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v9 +; ZVFHMIN-NEXT: vxor.vx v9, v10, a1 +; ZVFHMIN-NEXT: vand.vx v9, v9, a1 +; ZVFHMIN-NEXT: vsetivli zero, 3, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 +; ZVFHMIN-NEXT: vse16.v v8, (a0) +; ZVFHMIN-NEXT: ret %a = load <3 x half>, ptr %x %b = load <3 x float>, ptr %y %c = fneg <3 x float> %b @@ -5954,3 +4170,10 @@ define void @fnmadd_fmuladd_v2f64(ptr %x, ptr %y, ptr %z) { store <2 x double> %d, ptr %x ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; ZVFHMIN-RV32: {{.*}} +; ZVFHMIN-RV64: {{.*}} +; ZVFHMIN-ZFH-RV32: {{.*}} +; ZVFHMIN-ZFH-RV64: {{.*}} +; ZVFHMIN-ZFHIN-RV32: {{.*}} +; ZVFHMIN-ZFHIN-RV64: {{.*}} diff --git a/llvm/test/CodeGen/Thumb2/mve-fmath.ll b/llvm/test/CodeGen/Thumb2/mve-fmath.ll index d747da76a45fa..ad8921d2f7b02 100644 --- a/llvm/test/CodeGen/Thumb2/mve-fmath.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fmath.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s -; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s -; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s -; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,FULLFP16 +; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,MVEFP +; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,FULLFP16 +; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,MVEFP define arm_aapcs_vfpcc <4 x float> @sqrt_float32_t(<4 x float> %src) { ; CHECK-LABEL: sqrt_float32_t: @@ -1091,107 +1091,123 @@ entry: } define arm_aapcs_vfpcc <4 x float> @copysign_float32_t(<4 x float> %src1, <4 x float> %src2) { -; CHECK-LABEL: copysign_float32_t: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: 
push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r12, r1, d2 -; CHECK-NEXT: vmov r2, lr, d3 -; CHECK-NEXT: vmov r3, r0, d0 -; CHECK-NEXT: vmov r4, r5, d1 -; CHECK-NEXT: lsrs r1, r1, #31 -; CHECK-NEXT: bfi r0, r1, #31, #1 -; CHECK-NEXT: lsrs r1, r2, #31 -; CHECK-NEXT: bfi r4, r1, #31, #1 -; CHECK-NEXT: lsr.w r1, lr, #31 -; CHECK-NEXT: bfi r5, r1, #31, #1 -; CHECK-NEXT: lsr.w r1, r12, #31 -; CHECK-NEXT: bfi r3, r1, #31, #1 -; CHECK-NEXT: vmov s2, r4 -; CHECK-NEXT: vmov s3, r5 -; CHECK-NEXT: vmov s1, r0 -; CHECK-NEXT: vmov s0, r3 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; FULLFP16-LABEL: copysign_float32_t: +; FULLFP16: @ %bb.0: @ %entry +; FULLFP16-NEXT: .save {r4, r5, r7, lr} +; FULLFP16-NEXT: push {r4, r5, r7, lr} +; FULLFP16-NEXT: vmov r12, r1, d2 +; FULLFP16-NEXT: vmov r2, lr, d3 +; FULLFP16-NEXT: vmov r3, r0, d0 +; FULLFP16-NEXT: vmov r4, r5, d1 +; FULLFP16-NEXT: lsrs r1, r1, #31 +; FULLFP16-NEXT: bfi r0, r1, #31, #1 +; FULLFP16-NEXT: lsrs r1, r2, #31 +; FULLFP16-NEXT: bfi r4, r1, #31, #1 +; FULLFP16-NEXT: lsr.w r1, lr, #31 +; FULLFP16-NEXT: bfi r5, r1, #31, #1 +; FULLFP16-NEXT: lsr.w r1, r12, #31 +; FULLFP16-NEXT: bfi r3, r1, #31, #1 +; FULLFP16-NEXT: vmov s2, r4 +; FULLFP16-NEXT: vmov s3, r5 +; FULLFP16-NEXT: vmov s1, r0 +; FULLFP16-NEXT: vmov s0, r3 +; FULLFP16-NEXT: pop {r4, r5, r7, pc} +; +; MVEFP-LABEL: copysign_float32_t: +; MVEFP: @ %bb.0: @ %entry +; MVEFP-NEXT: vmov.i32 q2, #0x80000000 +; MVEFP-NEXT: vbic.i32 q0, #0x80000000 +; MVEFP-NEXT: vand q1, q1, q2 +; MVEFP-NEXT: vorr q0, q0, q1 +; MVEFP-NEXT: bx lr entry: %0 = call fast <4 x float> @llvm.copysign.v4f32(<4 x float> %src1, <4 x float> %src2) ret <4 x float> %0 } define arm_aapcs_vfpcc <8 x half> @copysign_float16_t(<8 x half> %src1, <8 x half> %src2) { -; CHECK-LABEL: copysign_float16_t: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: vmovx.f16 s8, s4 -; CHECK-NEXT: vstr.16 s8, [sp, #24] -; CHECK-NEXT: vstr.16 s4, [sp, #28] -; CHECK-NEXT: vmovx.f16 s4, s5 -; 
CHECK-NEXT: vstr.16 s4, [sp, #16] -; CHECK-NEXT: vmovx.f16 s4, s6 -; CHECK-NEXT: vstr.16 s5, [sp, #20] -; CHECK-NEXT: vstr.16 s4, [sp, #8] -; CHECK-NEXT: vmovx.f16 s4, s7 -; CHECK-NEXT: vstr.16 s6, [sp, #12] -; CHECK-NEXT: vstr.16 s4, [sp] -; CHECK-NEXT: vstr.16 s7, [sp, #4] -; CHECK-NEXT: ldrb.w r0, [sp, #25] -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vabs.f16 s4, s4 -; CHECK-NEXT: vneg.f16 s6, s4 -; CHECK-NEXT: lsls r0, r0, #24 -; CHECK-NEXT: it pl -; CHECK-NEXT: vmovpl.f32 s6, s4 -; CHECK-NEXT: ldrb.w r0, [sp, #29] -; CHECK-NEXT: vabs.f16 s4, s0 -; CHECK-NEXT: vneg.f16 s0, s4 -; CHECK-NEXT: lsls r0, r0, #24 -; CHECK-NEXT: it pl -; CHECK-NEXT: vmovpl.f32 s0, s4 -; CHECK-NEXT: ldrb.w r0, [sp, #17] -; CHECK-NEXT: vmovx.f16 s4, s1 -; CHECK-NEXT: vabs.f16 s4, s4 -; CHECK-NEXT: vins.f16 s0, s6 -; CHECK-NEXT: vneg.f16 s6, s4 -; CHECK-NEXT: lsls r0, r0, #24 -; CHECK-NEXT: it pl -; CHECK-NEXT: vmovpl.f32 s6, s4 -; CHECK-NEXT: ldrb.w r0, [sp, #21] -; CHECK-NEXT: vabs.f16 s4, s1 -; CHECK-NEXT: vneg.f16 s1, s4 -; CHECK-NEXT: lsls r0, r0, #24 -; CHECK-NEXT: it pl -; CHECK-NEXT: vmovpl.f32 s1, s4 -; CHECK-NEXT: ldrb.w r0, [sp, #9] -; CHECK-NEXT: vmovx.f16 s4, s2 -; CHECK-NEXT: vabs.f16 s4, s4 -; CHECK-NEXT: vins.f16 s1, s6 -; CHECK-NEXT: vneg.f16 s6, s4 -; CHECK-NEXT: lsls r0, r0, #24 -; CHECK-NEXT: it pl -; CHECK-NEXT: vmovpl.f32 s6, s4 -; CHECK-NEXT: ldrb.w r0, [sp, #13] -; CHECK-NEXT: vabs.f16 s4, s2 -; CHECK-NEXT: vneg.f16 s2, s4 -; CHECK-NEXT: lsls r0, r0, #24 -; CHECK-NEXT: it pl -; CHECK-NEXT: vmovpl.f32 s2, s4 -; CHECK-NEXT: ldrb.w r0, [sp, #1] -; CHECK-NEXT: vmovx.f16 s4, s3 -; CHECK-NEXT: vabs.f16 s4, s4 -; CHECK-NEXT: vins.f16 s2, s6 -; CHECK-NEXT: vneg.f16 s6, s4 -; CHECK-NEXT: lsls r0, r0, #24 -; CHECK-NEXT: it pl -; CHECK-NEXT: vmovpl.f32 s6, s4 -; CHECK-NEXT: ldrb.w r0, [sp, #5] -; CHECK-NEXT: vabs.f16 s4, s3 -; CHECK-NEXT: vneg.f16 s3, s4 -; CHECK-NEXT: lsls r0, r0, #24 -; CHECK-NEXT: it pl -; CHECK-NEXT: vmovpl.f32 s3, s4 -; CHECK-NEXT: vins.f16 s3, s6 -; 
CHECK-NEXT: add sp, #32 -; CHECK-NEXT: bx lr +; FULLFP16-LABEL: copysign_float16_t: +; FULLFP16: @ %bb.0: @ %entry +; FULLFP16-NEXT: .pad #32 +; FULLFP16-NEXT: sub sp, #32 +; FULLFP16-NEXT: vmovx.f16 s8, s4 +; FULLFP16-NEXT: vstr.16 s8, [sp, #24] +; FULLFP16-NEXT: vstr.16 s4, [sp, #28] +; FULLFP16-NEXT: vmovx.f16 s4, s5 +; FULLFP16-NEXT: vstr.16 s4, [sp, #16] +; FULLFP16-NEXT: vmovx.f16 s4, s6 +; FULLFP16-NEXT: vstr.16 s5, [sp, #20] +; FULLFP16-NEXT: vstr.16 s4, [sp, #8] +; FULLFP16-NEXT: vmovx.f16 s4, s7 +; FULLFP16-NEXT: vstr.16 s6, [sp, #12] +; FULLFP16-NEXT: vstr.16 s4, [sp] +; FULLFP16-NEXT: vstr.16 s7, [sp, #4] +; FULLFP16-NEXT: ldrb.w r0, [sp, #25] +; FULLFP16-NEXT: vmovx.f16 s4, s0 +; FULLFP16-NEXT: vabs.f16 s4, s4 +; FULLFP16-NEXT: vneg.f16 s6, s4 +; FULLFP16-NEXT: lsls r0, r0, #24 +; FULLFP16-NEXT: it pl +; FULLFP16-NEXT: vmovpl.f32 s6, s4 +; FULLFP16-NEXT: ldrb.w r0, [sp, #29] +; FULLFP16-NEXT: vabs.f16 s4, s0 +; FULLFP16-NEXT: vneg.f16 s0, s4 +; FULLFP16-NEXT: lsls r0, r0, #24 +; FULLFP16-NEXT: it pl +; FULLFP16-NEXT: vmovpl.f32 s0, s4 +; FULLFP16-NEXT: ldrb.w r0, [sp, #17] +; FULLFP16-NEXT: vmovx.f16 s4, s1 +; FULLFP16-NEXT: vabs.f16 s4, s4 +; FULLFP16-NEXT: vins.f16 s0, s6 +; FULLFP16-NEXT: vneg.f16 s6, s4 +; FULLFP16-NEXT: lsls r0, r0, #24 +; FULLFP16-NEXT: it pl +; FULLFP16-NEXT: vmovpl.f32 s6, s4 +; FULLFP16-NEXT: ldrb.w r0, [sp, #21] +; FULLFP16-NEXT: vabs.f16 s4, s1 +; FULLFP16-NEXT: vneg.f16 s1, s4 +; FULLFP16-NEXT: lsls r0, r0, #24 +; FULLFP16-NEXT: it pl +; FULLFP16-NEXT: vmovpl.f32 s1, s4 +; FULLFP16-NEXT: ldrb.w r0, [sp, #9] +; FULLFP16-NEXT: vmovx.f16 s4, s2 +; FULLFP16-NEXT: vabs.f16 s4, s4 +; FULLFP16-NEXT: vins.f16 s1, s6 +; FULLFP16-NEXT: vneg.f16 s6, s4 +; FULLFP16-NEXT: lsls r0, r0, #24 +; FULLFP16-NEXT: it pl +; FULLFP16-NEXT: vmovpl.f32 s6, s4 +; FULLFP16-NEXT: ldrb.w r0, [sp, #13] +; FULLFP16-NEXT: vabs.f16 s4, s2 +; FULLFP16-NEXT: vneg.f16 s2, s4 +; FULLFP16-NEXT: lsls r0, r0, #24 +; FULLFP16-NEXT: it pl +; FULLFP16-NEXT: 
vmovpl.f32 s2, s4 +; FULLFP16-NEXT: ldrb.w r0, [sp, #1] +; FULLFP16-NEXT: vmovx.f16 s4, s3 +; FULLFP16-NEXT: vabs.f16 s4, s4 +; FULLFP16-NEXT: vins.f16 s2, s6 +; FULLFP16-NEXT: vneg.f16 s6, s4 +; FULLFP16-NEXT: lsls r0, r0, #24 +; FULLFP16-NEXT: it pl +; FULLFP16-NEXT: vmovpl.f32 s6, s4 +; FULLFP16-NEXT: ldrb.w r0, [sp, #5] +; FULLFP16-NEXT: vabs.f16 s4, s3 +; FULLFP16-NEXT: vneg.f16 s3, s4 +; FULLFP16-NEXT: lsls r0, r0, #24 +; FULLFP16-NEXT: it pl +; FULLFP16-NEXT: vmovpl.f32 s3, s4 +; FULLFP16-NEXT: vins.f16 s3, s6 +; FULLFP16-NEXT: add sp, #32 +; FULLFP16-NEXT: bx lr +; +; MVEFP-LABEL: copysign_float16_t: +; MVEFP: @ %bb.0: @ %entry +; MVEFP-NEXT: vmov.i16 q2, #0x8000 +; MVEFP-NEXT: vbic.i16 q0, #0x8000 +; MVEFP-NEXT: vand q1, q1, q2 +; MVEFP-NEXT: vorr q0, q0, q1 +; MVEFP-NEXT: bx lr entry: %0 = call fast <8 x half> @llvm.copysign.v8f16(<8 x half> %src1, <8 x half> %src2) ret <8 x half> %0 diff --git a/llvm/test/CodeGen/WebAssembly/simd-unsupported.ll b/llvm/test/CodeGen/WebAssembly/simd-unsupported.ll index 1d6e073271efa..4660e1bce1ee6 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-unsupported.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-unsupported.ll @@ -354,7 +354,11 @@ define <2 x i64> @rotr_v2i64(<2 x i64> %x, <2 x i64> %y) { ; ============================================================================== ; CHECK-LABEL: copysign_v4f32: -; CHECK: f32.copysign +; CHECK: v128.const +; CHECK-NEXT: v128.and +; CHECK-NEXT: v128.const +; CHECK-NEXT: v128.and +; CHECK-NEXT: v128.or declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) define <4 x float> @copysign_v4f32(<4 x float> %x, <4 x float> %y) { %v = call <4 x float> @llvm.copysign.v4f32(<4 x float> %x, <4 x float> %y) @@ -454,7 +458,11 @@ define <4 x float> @round_v4f32(<4 x float> %x) { ; ============================================================================== ; CHECK-LABEL: copysign_v2f64: -; CHECK: f64.copysign +; CHECK: v128.const +; CHECK-NEXT: v128.and +; CHECK-NEXT: v128.const +; 
CHECK-NEXT: v128.and +; CHECK-NEXT: v128.or declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) define <2 x double> @copysign_v2f64(<2 x double> %x, <2 x double> %y) { %v = call <2 x double> @llvm.copysign.v2f64(<2 x double> %x, <2 x double> %y) From df3f291d2a64bb01bc8fab69f296923c1c798909 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 30 Sep 2024 19:54:03 +0100 Subject: [PATCH 013/151] [TBAA] Add tests with pointers to structs to tbaa-pointers.c. Precommit tests for follow-up improvements to Clang's TBAA emission. Also add variants with -pointer-tbaa to tbaa-reference.cpp. --- clang/test/CodeGen/tbaa-pointers.c | 40 +++++++++++++++++++++++---- clang/test/CodeGen/tbaa-reference.cpp | 2 ++ 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/clang/test/CodeGen/tbaa-pointers.c b/clang/test/CodeGen/tbaa-pointers.c index 75d8c3d501750..8860b7042d0a2 100644 --- a/clang/test/CodeGen/tbaa-pointers.c +++ b/clang/test/CodeGen/tbaa-pointers.c @@ -116,17 +116,43 @@ void p2struct(struct S1 **ptr) { // COMMON-LABEL: define void @p2struct( // COMMON-SAME: ptr noundef [[PTR:%.+]]) // COMMON: [[PTR_ADDR:%.+]] = alloca ptr, align 8 - // ENABLED-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[P2S1_0:!.+]] - // ENABLED-NEXT: [[BASE:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[P2S1_0]] - // ENABLED-NEXT: store ptr null, ptr [[BASE]], align 8, !tbaa [[P1S1_:!.+]] - // DEFAULT-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] - // DEFAULT-NEXT: [[BASE:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] - // DEFAULT-NEXT: store ptr null, ptr [[BASE]], align 8, !tbaa [[ANYPTR]] + // ENABLED-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR:!.+]] + // DEFAULT-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] + // COMMON-NEXT: [[BASE:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] + // COMMON-NEXT: store ptr null, ptr [[BASE]], align 8, !tbaa [[ANYPTR]] + 
// COMMON-NEXT: ret void + // + *ptr = 0; +} + +void p2struct_const(struct S1 const **ptr) { + // COMMON-LABEL: define void @p2struct_const( + // COMMON-SAME: ptr noundef [[PTR:%.+]]) + // COMMON: [[PTR_ADDR:%.+]] = alloca ptr, align 8 + // COMMON-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] + // COMMON-NEXT: [[BASE:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] + // COMMON-NEXT: store ptr null, ptr [[BASE]], align 8, !tbaa [[ANYPTR]] // COMMON-NEXT: ret void // *ptr = 0; } +struct S2 { + struct S1 *s; +}; + +void p2struct2(struct S2 *ptr) { + // COMMON-LABEL: define void @p2struct2( + // COMMON-SAME: ptr noundef [[PTR:%.+]]) + // COMMON: [[PTR_ADDR:%.+]] = alloca ptr, align 8 + // COMMON-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] + // COMMON-NEXT: [[BASE:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] + // COMMON-NEXT: [[S:%.+]] = getelementptr inbounds nuw %struct.S2, ptr [[BASE]], i32 0, i32 0 + // COMMON-NEXT: store ptr null, ptr [[S]], align 8, !tbaa [[S2_S_TAG:!.+]] + // COMMON-NEXT: ret void + ptr->s = 0; +} + // ENABLED: [[P2INT_0]] = !{[[P2INT:!.+]], [[P2INT]], i64 0} // ENABLED: [[P2INT]] = !{!"p2 int", [[ANY_POINTER:!.+]], i64 0} // DEFAULT: [[ANYPTR]] = !{[[ANY_POINTER:!.+]], [[ANY_POINTER]], i64 0} @@ -145,3 +171,5 @@ void p2struct(struct S1 **ptr) { // ENABLED: [[P2CHAR]] = !{!"p2 omnipotent char", [[ANY_POINTER]], i64 0} // ENABLED: [[P1CHAR_0]] = !{[[P1CHAR:!.+]], [[P1CHAR]], i64 0} // ENABLED: [[P1CHAR]] = !{!"p1 omnipotent char", [[ANY_POINTER]], i64 0} +// COMMON: [[S2_S_TAG]] = !{[[S2_TY:!.+]], [[ANY_POINTER]], i64 0} +// COMMON: [[S2_TY]] = !{!"S2", [[ANY_POINTER]], i64 0} diff --git a/clang/test/CodeGen/tbaa-reference.cpp b/clang/test/CodeGen/tbaa-reference.cpp index c4d9e70a8b07f..d22cd90b43ae9 100644 --- a/clang/test/CodeGen/tbaa-reference.cpp +++ b/clang/test/CodeGen/tbaa-reference.cpp @@ -1,5 +1,7 @@ // RUN: %clang_cc1 -triple x86_64-linux -O1 
-disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefixes=CHECK,OLD-PATH +// RUN: %clang_cc1 -triple x86_64-linux -O1 -disable-llvm-passes -pointer-tbaa %s -emit-llvm -o - | FileCheck %s -check-prefixes=CHECK,OLD-PATH // RUN: %clang_cc1 -triple x86_64-linux -O1 -disable-llvm-passes %s -emit-llvm -new-struct-path-tbaa -o - | FileCheck %s -check-prefixes=CHECK,NEW-PATH +// RUN: %clang_cc1 -triple x86_64-linux -O1 -disable-llvm-passes %s -pointer-tbaa -emit-llvm -new-struct-path-tbaa -o - | FileCheck %s -check-prefixes=CHECK,NEW-PATH // // Check that we generate correct TBAA information for reference accesses. From 023f7c9382599111244e682ea6b26011dbf7fc56 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Mon, 30 Sep 2024 21:19:32 +0200 Subject: [PATCH 014/151] [mlir][Transforms][NFC] Dialect Conversion: Update docs for `remapValues` (#110414) Simplify the nesting structure of "if" checks in `remapValues` and update the code comments. This is what the comments stated in case there is no type converter: ``` // TODO: What we should do here is just set `desiredType` to `origType` // and then handle the necessary type conversions after the conversion // process has finished. Unfortunately a lot of patterns currently rely on // receiving the new operands even if the types change, so we keep the // original behavior here for now until all of the patterns relying on // this get updated. ``` However, without a type converter it is not possible to perform any materializations. Furthermore, the absence of a type converter indicates that the pattern does not care about type legality. Therefore, the current implementation is correct and this TODO can be removed. Note: Patterns that actually require a remapped type to match the original operand type can be equipped with a type converter that maps each type to itself. 
This TODO is outdated: ``` // TODO: There currently isn't any mechanism to do 1->N type conversion // via the PatternRewriter replacement API, so for now we just ignore it. ``` 1->N type conversions are already possible as part of block signature conversions. It is incorrect to just ignore such cases. However, there is currently no better way to handle 1->N conversions in this function because of infrastructure limitations. This is now clarified in the comments. --- .../Transforms/Utils/DialectConversion.cpp | 70 ++++++++++--------- 1 file changed, 38 insertions(+), 32 deletions(-) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 4693edadfb5ee..b5aab2416c3eb 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -1092,44 +1092,50 @@ LogicalResult ConversionPatternRewriterImpl::remapValues( SmallVectorImpl &remapped) { remapped.reserve(llvm::size(values)); - SmallVector legalTypes; for (const auto &it : llvm::enumerate(values)) { Value operand = it.value(); Type origType = operand.getType(); + Location operandLoc = inputLoc ? *inputLoc : operand.getLoc(); - // If a converter was provided, get the desired legal types for this - // operand. - Type desiredType; - if (currentTypeConverter) { - // If there is no legal conversion, fail to match this pattern. - legalTypes.clear(); - if (failed(currentTypeConverter->convertType(origType, legalTypes))) { - Location operandLoc = inputLoc ? *inputLoc : operand.getLoc(); - notifyMatchFailure(operandLoc, [=](Diagnostic &diag) { - diag << "unable to convert type for " << valueDiagTag << " #" - << it.index() << ", type was " << origType; - }); - return failure(); - } - // TODO: There currently isn't any mechanism to do 1->N type conversion - // via the PatternRewriter replacement API, so for now we just ignore it. 
- if (legalTypes.size() == 1) - desiredType = legalTypes.front(); - } else { - // TODO: What we should do here is just set `desiredType` to `origType` - // and then handle the necessary type conversions after the conversion - // process has finished. Unfortunately a lot of patterns currently rely on - // receiving the new operands even if the types change, so we keep the - // original behavior here for now until all of the patterns relying on - // this get updated. + if (!currentTypeConverter) { + // The current pattern does not have a type converter. I.e., it does not + // distinguish between legal and illegal types. For each operand, simply + // pass through the most recently mapped value. + remapped.push_back(mapping.lookupOrDefault(operand)); + continue; + } + + // If there is no legal conversion, fail to match this pattern. + SmallVector legalTypes; + if (failed(currentTypeConverter->convertType(origType, legalTypes))) { + notifyMatchFailure(operandLoc, [=](Diagnostic &diag) { + diag << "unable to convert type for " << valueDiagTag << " #" + << it.index() << ", type was " << origType; + }); + return failure(); } - Value newOperand = mapping.lookupOrDefault(operand, desiredType); - // Handle the case where the conversion was 1->1 and the new operand type - // isn't legal. - Type newOperandType = newOperand.getType(); - if (currentTypeConverter && desiredType && newOperandType != desiredType) { - Location operandLoc = inputLoc ? *inputLoc : operand.getLoc(); + if (legalTypes.size() != 1) { + // TODO: Parts of the dialect conversion infrastructure do not support + // 1->N type conversions yet. Therefore, if a type is converted to 0 or + // multiple types, the only thing that we can do for now is passing + // through the most recently mapped value. 
Fixing this requires + // improvements to the `ConversionValueMapping` (to be able to store 1:N + // mappings) and to the `ConversionPattern` adaptor handling (to be able + // to pass multiple remapped values for a single operand to the adaptor). + remapped.push_back(mapping.lookupOrDefault(operand)); + continue; + } + + // Handle 1->1 type conversions. + Type desiredType = legalTypes.front(); + // Try to find a mapped value with the desired type. (Or the operand itself + // if the value is not mapped at all.) + Value newOperand = mapping.lookupOrDefault(operand, desiredType); + if (newOperand.getType() != desiredType) { + // If the looked up value's type does not have the desired type, it means + // that the value was replaced with a value of different type and no + // source materialization was created yet. Value castValue = buildUnresolvedMaterialization( MaterializationKind::Target, computeInsertPoint(newOperand), operandLoc, /*inputs=*/newOperand, /*outputType=*/desiredType, From 49df12c01e99af6e091fedc123f775580064740a Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Mon, 30 Sep 2024 21:20:48 +0200 Subject: [PATCH 015/151] [mlir][NFC] Minor cleanup around `ModuleOp` usage (#110498) Use `moduleOp.getBody()` instead of `moduleOp.getBodyRegion().front()`. 
--- .../lib/Lower/OpenMP/DataSharingProcessor.cpp | 3 +-- .../BufferizationToMemRef.cpp | 3 +-- .../Conversion/GPUCommon/GPUOpsLowering.cpp | 20 ++++++++----------- .../Transforms/LowerDeallocations.cpp | 3 +-- .../Transforms/SparseGPUCodegen.cpp | 4 ++-- 5 files changed, 13 insertions(+), 20 deletions(-) diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp index 5f4138e0f63e7..23a171c657638 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp @@ -478,8 +478,7 @@ void DataSharingProcessor::doPrivatize(const semantics::Symbol *sym, return existingPrivatizer; mlir::OpBuilder::InsertionGuard guard(firOpBuilder); - firOpBuilder.setInsertionPoint(&moduleOp.getBodyRegion().front(), - moduleOp.getBodyRegion().front().begin()); + firOpBuilder.setInsertionPointToStart(moduleOp.getBody()); auto result = firOpBuilder.create( symLoc, uniquePrivatizerName, symType, isFirstPrivate ? mlir::omp::DataSharingClauseType::FirstPrivate diff --git a/mlir/lib/Conversion/BufferizationToMemRef/BufferizationToMemRef.cpp b/mlir/lib/Conversion/BufferizationToMemRef/BufferizationToMemRef.cpp index f9903071be084..06aedc5e139d3 100644 --- a/mlir/lib/Conversion/BufferizationToMemRef/BufferizationToMemRef.cpp +++ b/mlir/lib/Conversion/BufferizationToMemRef/BufferizationToMemRef.cpp @@ -134,8 +134,7 @@ struct BufferizationToMemRefPass bufferization::DeallocHelperMap deallocHelperFuncMap; if (auto module = dyn_cast(getOperation())) { - OpBuilder builder = - OpBuilder::atBlockBegin(&module.getBodyRegion().front()); + OpBuilder builder = OpBuilder::atBlockBegin(module.getBody()); // Build dealloc helper function if there are deallocs. 
getOperation()->walk([&](bufferization::DeallocOp deallocOp) { diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp index 5b590a457f771..40558a0822441 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp @@ -642,11 +642,10 @@ static IntegerAttr wrapNumericMemorySpace(MLIRContext *ctx, unsigned space) { /// Generates a symbol with 0-sized array type for dynamic shared memory usage, /// or uses existing symbol. -LLVM::GlobalOp -getDynamicSharedMemorySymbol(ConversionPatternRewriter &rewriter, - Operation *moduleOp, gpu::DynamicSharedMemoryOp op, - const LLVMTypeConverter *typeConverter, - MemRefType memrefType, unsigned alignmentBit) { +LLVM::GlobalOp getDynamicSharedMemorySymbol( + ConversionPatternRewriter &rewriter, gpu::GPUModuleOp moduleOp, + gpu::DynamicSharedMemoryOp op, const LLVMTypeConverter *typeConverter, + MemRefType memrefType, unsigned alignmentBit) { uint64_t alignmentByte = alignmentBit / memrefType.getElementTypeBitWidth(); FailureOr addressSpace = @@ -661,8 +660,7 @@ getDynamicSharedMemorySymbol(ConversionPatternRewriter &rewriter, // Step 1. Collect symbol names of LLVM::GlobalOp Ops. Also if any of // LLVM::GlobalOp is suitable for shared memory, return it. llvm::StringSet<> existingGlobalNames; - for (auto globalOp : - moduleOp->getRegion(0).front().getOps()) { + for (auto globalOp : moduleOp.getBody()->getOps()) { existingGlobalNames.insert(globalOp.getSymName()); if (auto arrayType = dyn_cast(globalOp.getType())) { if (globalOp.getAddrSpace() == addressSpace.value() && @@ -684,7 +682,7 @@ getDynamicSharedMemorySymbol(ConversionPatternRewriter &rewriter, // Step 3. 
Generate a global op OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPoint(&moduleOp->getRegion(0).front().front()); + rewriter.setInsertionPointToStart(moduleOp.getBody()); auto zeroSizedArrayType = LLVM::LLVMArrayType::get( typeConverter->convertType(memrefType.getElementType()), 0); @@ -709,10 +707,8 @@ LogicalResult GPUDynamicSharedMemoryOpLowering::matchAndRewrite( // Step 2: Generate a global symbol or existing for the dynamic shared // memory with memref<0xi8> type - LLVM::LLVMFuncOp funcOp = op->getParentOfType(); - LLVM::GlobalOp shmemOp = {}; - Operation *moduleOp = funcOp->getParentWithTrait(); - shmemOp = getDynamicSharedMemorySymbol( + auto moduleOp = op->getParentOfType(); + LLVM::GlobalOp shmemOp = getDynamicSharedMemorySymbol( rewriter, moduleOp, op, getTypeConverter(), memrefType0sz, alignmentBit); // Step 3. Get address of the global symbol diff --git a/mlir/lib/Dialect/Bufferization/Transforms/LowerDeallocations.cpp b/mlir/lib/Dialect/Bufferization/Transforms/LowerDeallocations.cpp index 9e2c91bad7bfd..31d165ce15407 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/LowerDeallocations.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/LowerDeallocations.cpp @@ -391,8 +391,7 @@ struct LowerDeallocationsPass bufferization::DeallocHelperMap deallocHelperFuncMap; if (auto module = dyn_cast(getOperation())) { - OpBuilder builder = - OpBuilder::atBlockBegin(&module.getBodyRegion().front()); + OpBuilder builder = OpBuilder::atBlockBegin(module.getBody()); // Build dealloc helper function if there are deallocs. 
getOperation()->walk([&](bufferization::DeallocOp deallocOp) { diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp index 8be76cac87f29..b7fac163ba5fe 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp @@ -58,7 +58,7 @@ static gpu::GPUModuleOp genGPUModule(OpBuilder &builder, ModuleOp topModule) { for (auto op : topModule.getBodyRegion().getOps()) return op; // existing markAsGPUContainer(topModule); - builder.setInsertionPointToStart(&topModule.getBodyRegion().front()); + builder.setInsertionPointToStart(topModule.getBody()); return builder.create(topModule->getLoc(), "sparse_kernels"); } @@ -75,7 +75,7 @@ static gpu::GPUFuncOp genGPUFunc(OpBuilder &builder, gpu::GPUModuleOp gpuModule, ("kernel" + Twine(kernelNumber++)).toStringRef(kernelName); } while (gpuModule.lookupSymbol(kernelName)); // Then we insert a new kernel with given arguments into the module. - builder.setInsertionPointToStart(&gpuModule.getBodyRegion().front()); + builder.setInsertionPointToStart(gpuModule.getBody()); SmallVector argsTp; for (auto arg : args) argsTp.push_back(arg.getType()); From 1094ee71da533929cebb7ce98fd2665c924387a7 Mon Sep 17 00:00:00 2001 From: Abid Qadeer Date: Mon, 30 Sep 2024 20:31:08 +0100 Subject: [PATCH 016/151] [flang][debug] Better handle array lower bound of assumed shape arrays. (#110302) As mentioned in #108633, we don't respect the lower bound of the assumed shape arrays if those were specified. It happens in both cases: 1. When caller has non-default lower bound and callee has default 2. When callee has non-default lower bound and caller has default This PR tries to fix this issue by improving our generation of lower bound attribute on DICompositeTypeAttr. If we see a lower bound in the declaration, we respect that. Note that same function is also used for allocatable/pointer variables. 
We make sure that we get the lower bound from descriptor in those cases. Please note that DWARF assumes a lower bound of 1 so in many cases we don't need to generate the lower bound. Fixes #108633. --- .../Transforms/DebugTypeGenerator.cpp | 33 ++++++++++++++----- .../test/Integration/debug-allocatable-1.f90 | 4 +-- .../Integration/debug-assumed-shape-array.f90 | 17 +++++++--- flang/test/Integration/debug-ptr-type.f90 | 4 ++- .../Transforms/debug-assumed-shape-array.fir | 17 +++++++--- 5 files changed, 55 insertions(+), 20 deletions(-) diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp index e52812fb320cb..29e61d505bf6a 100644 --- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp +++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp @@ -113,10 +113,21 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertBoxedSequenceType( mlir::LLVM::DITypeAttr elemTy = convertType(seqTy.getEleTy(), fileAttr, scope, declOp); unsigned offset = dimsOffset; + unsigned index = 0; + mlir::IntegerType intTy = mlir::IntegerType::get(context, 64); const unsigned indexSize = dimsSize / 3; for ([[maybe_unused]] auto _ : seqTy.getShape()) { // For each dimension, find the offset of count, lower bound and stride in // the descriptor and generate the dwarf expression to extract it. + mlir::Attribute lowerAttr = nullptr; + // If declaration has a lower bound, use it. + if (declOp && declOp.getShift().size() > index) { + // TODO: Handle case where lower bound is a variable (instead of a + // constant as handled here) + if (std::optional optint = + getIntIfConstant(declOp.getShift()[index])) + lowerAttr = mlir::IntegerAttr::get(intTy, llvm::APInt(64, *optint)); + } // FIXME: If `indexSize` happens to be bigger than address size on the // system then we may have to change 'DW_OP_deref' here. 
addOp(llvm::dwarf::DW_OP_push_object_address, {}); @@ -129,14 +140,19 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertBoxedSequenceType( mlir::LLVM::DIExpressionAttr::get(context, ops); ops.clear(); - addOp(llvm::dwarf::DW_OP_push_object_address, {}); - addOp(llvm::dwarf::DW_OP_plus_uconst, - {offset + (indexSize * kDimLowerBoundPos)}); - addOp(llvm::dwarf::DW_OP_deref, {}); - // lower_bound[i] = *(base_addr + offset + (indexSize * kDimLowerBoundPos)) - mlir::LLVM::DIExpressionAttr lowerAttr = - mlir::LLVM::DIExpressionAttr::get(context, ops); - ops.clear(); + // If a lower bound was not found in the declOp, then we will get them from + // descriptor only for pointer and allocatable case. DWARF assumes lower + // bound of 1 when this attribute is missing. + if (!lowerAttr && (genAllocated || genAssociated)) { + addOp(llvm::dwarf::DW_OP_push_object_address, {}); + addOp(llvm::dwarf::DW_OP_plus_uconst, + {offset + (indexSize * kDimLowerBoundPos)}); + addOp(llvm::dwarf::DW_OP_deref, {}); + // lower_bound[i] = *(base_addr + offset + (indexSize * + // kDimLowerBoundPos)) + lowerAttr = mlir::LLVM::DIExpressionAttr::get(context, ops); + ops.clear(); + } addOp(llvm::dwarf::DW_OP_push_object_address, {}); addOp(llvm::dwarf::DW_OP_plus_uconst, @@ -151,6 +167,7 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertBoxedSequenceType( mlir::LLVM::DISubrangeAttr subrangeTy = mlir::LLVM::DISubrangeAttr::get( context, countAttr, lowerAttr, /*upperBound=*/nullptr, strideAttr); elements.push_back(subrangeTy); + ++index; } return mlir::LLVM::DICompositeTypeAttr::get( context, llvm::dwarf::DW_TAG_array_type, /*name=*/nullptr, diff --git a/flang/test/Integration/debug-allocatable-1.f90 b/flang/test/Integration/debug-allocatable-1.f90 index 471c8cdb7d54e..b9de3b26cdf98 100644 --- a/flang/test/Integration/debug-allocatable-1.f90 +++ b/flang/test/Integration/debug-allocatable-1.f90 @@ -17,8 +17,8 @@ end subroutine ff ! 
CHECK-DAG: !DILocalVariable(name: "ar1"{{.*}}type: ![[TY1:[0-9]+]]) ! CHECK-DAG: ![[TY1]] = !DICompositeType(tag: DW_TAG_array_type{{.*}}elements: ![[ELEMS2:[0-9]+]]{{.*}}dataLocation{{.*}}allocated: !DIExpression(DW_OP_push_object_address, DW_OP_deref, DW_OP_lit0, DW_OP_ne)) ! CHECK-DAG: ![[ELEMS2]] = !{![[ELEM1:[0-9]+]], ![[ELEM2:[0-9]+]]} -! CHECK-DAG: ![[ELEM1]] = !DISubrange -! CHECK-DAG: ![[ELEM2]] = !DISubrange +! CHECK-DAG: ![[ELEM1]] = !DISubrange(count: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, {{[0-9]+}}, DW_OP_deref), lowerBound: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, {{[0-9]+}}, DW_OP_deref), stride: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, {{[0-9]+}}, DW_OP_deref)) +! CHECK-DAG: ![[ELEM2]] = !DISubrange(count: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, {{[0-9]+}}, DW_OP_deref), lowerBound: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, {{[0-9]+}}, DW_OP_deref), stride: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, {{[0-9]+}}, DW_OP_deref)) ! CHECK-DAG: !DILocalVariable(name: "sc"{{.*}}type: ![[TY2:[0-9]+]]) ! CHECK-DAG: ![[TY2]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: ![[TY3:[0-9]+]]{{.*}}) ! CHECK-DAG: ![[TY3]] = !DIBasicType(name: "real"{{.*}}) diff --git a/flang/test/Integration/debug-assumed-shape-array.f90 b/flang/test/Integration/debug-assumed-shape-array.f90 index 9a439e20d1981..bdfbf34ca4d15 100644 --- a/flang/test/Integration/debug-assumed-shape-array.f90 +++ b/flang/test/Integration/debug-assumed-shape-array.f90 @@ -1,13 +1,20 @@ ! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -o - | FileCheck %s -subroutine ff(arr) +subroutine ff(arr, arr1) implicit none integer :: arr(:, :) - return arr(1,1) + integer :: arr1(3:, 4:) + return arr(1,1) + arr1(3,4) end subroutine ff -! 
CHECK-DAG: !DICompositeType(tag: DW_TAG_array_type{{.*}}elements: ![[ELEMS:[0-9]+]], dataLocation: !DIExpression(DW_OP_push_object_address, DW_OP_deref)) +! CHECK-DAG: !DILocalVariable(name: "arr"{{.*}}type: ![[TY1:[0-9]+]]{{.*}}) +! CHECK-DAG: ![[TY1]] = !DICompositeType(tag: DW_TAG_array_type{{.*}}elements: ![[ELEMS:[0-9]+]], dataLocation: !DIExpression(DW_OP_push_object_address, DW_OP_deref)) ! CHECK-DAG: ![[ELEMS]] = !{![[ELEM1:[0-9]+]], ![[ELEM2:[0-9]+]]} -! CHECK-DAG: ![[ELEM1]] = !DISubrange(count: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 32, DW_OP_deref), lowerBound: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 24, DW_OP_deref), stride: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 40, DW_OP_deref)) -! CHECK-DAG: ![[ELEM2]] = !DISubrange(count: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 56, DW_OP_deref), lowerBound: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 48, DW_OP_deref), stride: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 64, DW_OP_deref)) +! CHECK-DAG: ![[ELEM1]] = !DISubrange(count: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 32, DW_OP_deref), stride: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 40, DW_OP_deref)) +! CHECK-DAG: ![[ELEM2]] = !DISubrange(count: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 56, DW_OP_deref), stride: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 64, DW_OP_deref)) +! CHECK-DAG: !DILocalVariable(name: "arr1"{{.*}}type: ![[TY2:[0-9]+]]{{.*}}) +! CHECK-DAG: ![[TY2]] = !DICompositeType(tag: DW_TAG_array_type{{.*}}elements: ![[ELEMS1:[0-9]+]], dataLocation: !DIExpression(DW_OP_push_object_address, DW_OP_deref)) +! CHECK-DAG: ![[ELEMS1]] = !{![[ELEM11:[0-9]+]], ![[ELEM12:[0-9]+]]} +! 
CHECK-DAG: ![[ELEM11]] = !DISubrange(count: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 32, DW_OP_deref), lowerBound: 3, stride: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 40, DW_OP_deref)) +! CHECK-DAG: ![[ELEM12]] = !DISubrange(count: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 56, DW_OP_deref), lowerBound: 4, stride: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 64, DW_OP_deref)) diff --git a/flang/test/Integration/debug-ptr-type.f90 b/flang/test/Integration/debug-ptr-type.f90 index bff7bcb862b5c..6d7178f7aca41 100644 --- a/flang/test/Integration/debug-ptr-type.f90 +++ b/flang/test/Integration/debug-ptr-type.f90 @@ -41,7 +41,9 @@ end subroutine ff ! CHECK-DAG: ![[ELEMS1:[0-9]+]] = !{!{{[0-9]+}}} ! CHECK-DAG: !DILocalVariable(name: "par"{{.*}}type: ![[ARR_TY1:[0-9]+]]) ! CHECK-DAG: ![[ARR_TY1]] = !DICompositeType(tag: DW_TAG_array_type{{.*}}elements: ![[ELEMS2:[0-9]+]], dataLocation: !DIExpression(DW_OP_push_object_address, DW_OP_deref), associated: !DIExpression(DW_OP_push_object_address, DW_OP_deref, DW_OP_lit0, DW_OP_ne)) -! CHECK-DAG: ![[ELEMS2]] = !{!{{[0-9]+}}, !{{[0-9]+}}} +! CHECK-DAG: ![[ELEMS2]] = !{![[ELEM21:[0-9]+]], ![[ELEM22:[0-9]+]]} +! CHECK-DAG: ![[ELEM21]] = !DISubrange(count: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, {{[0-9]+}}, DW_OP_deref), lowerBound: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, {{[0-9]+}}, DW_OP_deref), stride: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, {{[0-9]+}}, DW_OP_deref)) +! CHECK-DAG: ![[ELEM22]] = !DISubrange(count: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, {{[0-9]+}}, DW_OP_deref), lowerBound: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, {{[0-9]+}}, DW_OP_deref), stride: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, {{[0-9]+}}, DW_OP_deref)) ! CHECK-DAG: !DILocalVariable(name: "par2"{{.*}}type: ![[ARR_TY2:[0-9]+]]) ! 
CHECK-DAG: ![[ARR_TY2]] = !DICompositeType(tag: DW_TAG_array_type{{.*}}, elements: ![[ELEMS1]], dataLocation: !DIExpression(DW_OP_push_object_address, DW_OP_deref), associated: !DIExpression(DW_OP_push_object_address, DW_OP_deref, DW_OP_lit0, DW_OP_ne)) ! CHECK-DAG: !DILocalVariable(name: "psc"{{.*}}type: ![[PTR_TY:[0-9]+]]) diff --git a/flang/test/Transforms/debug-assumed-shape-array.fir b/flang/test/Transforms/debug-assumed-shape-array.fir index d1e64297acea7..cb3927a7d79cf 100644 --- a/flang/test/Transforms/debug-assumed-shape-array.fir +++ b/flang/test/Transforms/debug-assumed-shape-array.fir @@ -1,16 +1,25 @@ // RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s | FileCheck %s module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry, dense<64> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>, #dlti.dl_entry<"dlti.endianness", "little">>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"} { - func.func @ff_(%arg0: !fir.box> {fir.bindc_name = "arr"} ) { + func.func @ff_(%arg0: !fir.box> {fir.bindc_name = "arr"}, %arg1: !fir.box> {fir.bindc_name = "arr1"}) { + %c4 = arith.constant 4 : index + %c3 = arith.constant 3 : index %0 = fir.undefined !fir.dscope %1 = fircg.ext_declare %arg0 dummy_scope %0 {uniq_name = "_QFffEarr"} : (!fir.box>, !fir.dscope) -> !fir.box> loc(#loc1) + %2 = fircg.ext_declare %arg1 origin %c3, %c4 dummy_scope %0 {uniq_name = "_QFffEarr1"} : (!fir.box>, index, index, 
!fir.dscope) -> !fir.box> loc(#loc3) return } loc(#loc2) } #loc1 = loc("test1.f90":1:1) #loc2 = loc("test1.f90":3:16) +#loc3 = loc("test1.f90":4:16) -// CHECK: #llvm.di_composite_type, lowerBound = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(24), DW_OP_deref]>, stride = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(40), DW_OP_deref]>>, -// CHECK-SAME: #llvm.di_subrange, lowerBound = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(48), DW_OP_deref]>, stride = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(64), DW_OP_deref]>> +// CHECK: #[[TY1:.*]] = #llvm.di_composite_type, stride = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(40), DW_OP_deref]>>, +// CHECK-SAME: #llvm.di_subrange, stride = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(64), DW_OP_deref]>> // CHECK-SAME: dataLocation = <[DW_OP_push_object_address, DW_OP_deref]>> +// CHECK: #[[TY2:.*]] = #llvm.di_composite_type, lowerBound = 3 : i64, stride = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(40), DW_OP_deref]>>, +// CHECK-SAME: #llvm.di_subrange, lowerBound = 4 : i64, stride = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(64), DW_OP_deref]>>, dataLocation = <[DW_OP_push_object_address, DW_OP_deref]>> +// CHECK: #llvm.di_local_variable<{{.*}}name = "arr"{{.*}}type = #[[TY1]]> +// CHECK: #llvm.di_local_variable<{{.*}}name = "arr1"{{.*}}type = #[[TY2]]> From b5aea32920ee6874bbdc7d6414039adce1b6c19a Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Mon, 30 Sep 2024 12:37:23 -0700 Subject: [PATCH 017/151] [flang] Improve error recovery for bad statement after CONTAINS (#109698) After a CONTAINS statement in a program unit, a statement that cannot begin a subprogram will trigger catastrophic error recovery. But the compiler is presently emitting multiple errors for the same location about expected variations of END statements. Emit fewer messages. 
Fixes https://github.com/llvm/llvm-project/issues/109609. --- flang/lib/Parser/program-parsers.cpp | 26 +++++++----- flang/lib/Parser/stmt-parser.h | 1 - flang/test/Parser/recovery06.f90 | 62 ++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+), 12 deletions(-) create mode 100644 flang/test/Parser/recovery06.f90 diff --git a/flang/lib/Parser/program-parsers.cpp b/flang/lib/Parser/program-parsers.cpp index c43696c52c160..2b7da18a09bb3 100644 --- a/flang/lib/Parser/program-parsers.cpp +++ b/flang/lib/Parser/program-parsers.cpp @@ -249,8 +249,9 @@ TYPE_CONTEXT_PARSER("PROGRAM statement"_en_US, // R1403 end-program-stmt -> END [PROGRAM [program-name]] TYPE_CONTEXT_PARSER("END PROGRAM statement"_en_US, - construct(recovery( - "END PROGRAM" >> maybe(name) || bareEnd, progUnitEndStmtErrorRecovery))) + construct( + recovery("END" >> defaulted("PROGRAM" >> maybe(name)) / atEndOfStmt, + progUnitEndStmtErrorRecovery))) // R1404 module -> // module-stmt [specification-part] [module-subprogram-part] @@ -266,8 +267,9 @@ TYPE_CONTEXT_PARSER( // R1406 end-module-stmt -> END [MODULE [module-name]] TYPE_CONTEXT_PARSER("END MODULE statement"_en_US, - construct(recovery( - "END MODULE" >> maybe(name) || bareEnd, progUnitEndStmtErrorRecovery))) + construct( + recovery("END" >> defaulted("MODULE" >> maybe(name)) / atEndOfStmt, + progUnitEndStmtErrorRecovery))) // R1407 module-subprogram-part -> contains-stmt [module-subprogram]... 
TYPE_CONTEXT_PARSER("module subprogram part"_en_US, @@ -334,7 +336,7 @@ TYPE_PARSER(construct(name, maybe(":" >> name))) // R1419 end-submodule-stmt -> END [SUBMODULE [submodule-name]] TYPE_CONTEXT_PARSER("END SUBMODULE statement"_en_US, construct( - recovery("END SUBMODULE" >> maybe(name) || bareEnd, + recovery("END" >> defaulted("SUBMODULE" >> maybe(name)) / atEndOfStmt, progUnitEndStmtErrorRecovery))) // R1420 block-data -> block-data-stmt [specification-part] end-block-data-stmt @@ -350,7 +352,7 @@ TYPE_CONTEXT_PARSER("BLOCK DATA statement"_en_US, // R1422 end-block-data-stmt -> END [BLOCK DATA [block-data-name]] TYPE_CONTEXT_PARSER("END BLOCK DATA statement"_en_US, construct( - recovery("END BLOCK DATA" >> maybe(name) || bareEnd, + recovery("END" >> defaulted("BLOCK DATA" >> maybe(name)) / atEndOfStmt, progUnitEndStmtErrorRecovery))) // R1501 interface-block -> @@ -564,8 +566,9 @@ TYPE_PARSER(construct( "RESULT" >> parenthesized(name), maybe(languageBindingSpec))) // R1533 end-function-stmt -> END [FUNCTION [function-name]] -TYPE_PARSER(construct(recovery( - "END FUNCTION" >> maybe(name) || bareEnd, progUnitEndStmtErrorRecovery))) +TYPE_PARSER(construct( + recovery("END" >> defaulted("FUNCTION" >> maybe(name)) / atEndOfStmt, + progUnitEndStmtErrorRecovery))) // R1534 subroutine-subprogram -> // subroutine-stmt [specification-part] [execution-part] @@ -591,8 +594,9 @@ TYPE_PARSER( TYPE_PARSER(construct(name) || construct(star)) // R1537 end-subroutine-stmt -> END [SUBROUTINE [subroutine-name]] -TYPE_PARSER(construct(recovery( - "END SUBROUTINE" >> maybe(name) || bareEnd, progUnitEndStmtErrorRecovery))) +TYPE_PARSER(construct( + recovery("END" >> defaulted("SUBROUTINE" >> maybe(name)) / atEndOfStmt, + progUnitEndStmtErrorRecovery))) // R1538 separate-module-subprogram -> // mp-subprogram-stmt [specification-part] [execution-part] @@ -609,7 +613,7 @@ TYPE_CONTEXT_PARSER("MODULE PROCEDURE statement"_en_US, // R1540 end-mp-subprogram-stmt -> END [PROCEDURE 
[procedure-name]] TYPE_CONTEXT_PARSER("END PROCEDURE statement"_en_US, construct( - recovery("END PROCEDURE" >> maybe(name) || bareEnd, + recovery("END" >> defaulted("PROCEDURE" >> maybe(name)) / atEndOfStmt, progUnitEndStmtErrorRecovery))) // R1541 entry-stmt -> ENTRY entry-name [( [dummy-arg-list] ) [suffix]] diff --git a/flang/lib/Parser/stmt-parser.h b/flang/lib/Parser/stmt-parser.h index 00bae2bf950c8..ee45c6fd5d38c 100644 --- a/flang/lib/Parser/stmt-parser.h +++ b/flang/lib/Parser/stmt-parser.h @@ -90,7 +90,6 @@ constexpr auto executionPartErrorRecovery{stmtErrorRecoveryStart >> // END statement error recovery constexpr auto missingOptionalName{pure>()}; constexpr auto noNameEnd{"END" >> missingOptionalName}; -constexpr auto bareEnd{noNameEnd / recovery(atEndOfStmt, SkipTo<'\n'>{})}; // For unrecognizable construct END statements. Be sure to not consume // a program unit's END statement. diff --git a/flang/test/Parser/recovery06.f90 b/flang/test/Parser/recovery06.f90 new file mode 100644 index 0000000000000..4c0214180eb0f --- /dev/null +++ b/flang/test/Parser/recovery06.f90 @@ -0,0 +1,62 @@ +! RUN: not %flang_fc1 -fsyntax-only %s 2>&1 | FileCheck %s +program p + contains +! CHECK: error: expected 'END' +! CHECK: in the context: END PROGRAM statement + continue +end + +subroutine s + contains +! CHECK: error: expected 'END' +! CHECK: in the context: SUBROUTINE subprogram + continue +end + +function f() + contains +! CHECK: error: expected 'END' +! CHECK: in the context: FUNCTION subprogram + continue +end + +module m + interface + module subroutine ms + end + end interface + contains +! CHECK: error: expected 'END' +! CHECK: in the context: END MODULE statement + continue +end + +module m2 + contains + subroutine m2s + contains +! CHECK: error: expected 'END' +! CHECK: in the context: SUBROUTINE subprogram + continue + end +end + +submodule(m) s1 + contains +! CHECK: error: expected 'END' +! 
CHECK: in the context: END SUBMODULE statement + continue +end + +submodule(m) s2 + contains + module procedure ms + contains +! CHECK: error: expected 'END' +! CHECK: in the context: END PROCEDURE statement + continue + end +end + +! Ensure no error cascade +! CHECK-NOT: error: From 1759f3b404a5bd73c6412b8a110f850ff1c43c24 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Mon, 30 Sep 2024 12:38:06 -0700 Subject: [PATCH 018/151] [flang] Improve error messages about overflowed integer conversions (#110031) When an INTEGER conversion to a smaller kind overflows in constant folding, report the truncated value so that it makes more sense later if it shows up in other messages. --- flang/lib/Evaluate/fold-implementation.h | 5 +++-- flang/test/Evaluate/errors01.f90 | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/flang/lib/Evaluate/fold-implementation.h b/flang/lib/Evaluate/fold-implementation.h index 1b14a305b87f4..89477dfb36435 100644 --- a/flang/lib/Evaluate/fold-implementation.h +++ b/flang/lib/Evaluate/fold-implementation.h @@ -1736,8 +1736,9 @@ Expr FoldOperation( msvcWorkaround.context.languageFeatures().ShouldWarn( common::UsageWarning::FoldingException)) { ctx.messages().Say( - "INTEGER(%d) to INTEGER(%d) conversion overflowed"_warn_en_US, - Operand::kind, TO::kind); + "conversion of %s_%d to INTEGER(%d) overflowed; result is %s"_warn_en_US, + value->SignedDecimal(), Operand::kind, TO::kind, + converted.value.SignedDecimal()); } return ScalarConstantToExpr(std::move(converted.value)); } else if constexpr (FromCat == TypeCategory::Real) { diff --git a/flang/test/Evaluate/errors01.f90 b/flang/test/Evaluate/errors01.f90 index bad73f2e8b160..684f3fc34e687 100644 --- a/flang/test/Evaluate/errors01.f90 +++ b/flang/test/Evaluate/errors01.f90 @@ -192,6 +192,8 @@ subroutine warnings real, parameter :: bad10 = product([huge(1.),huge(1.)]) !CHECK: warning: PRODUCT() of COMPLEX(4) data overflowed complex, parameter :: bad11 = 
product([(huge(1.),0.),(huge(1.),0.)]) + !CHECK: warning: conversion of 111111111111111111111_16 to INTEGER(8) overflowed; result is 430646668853801415 + integer(8), parameter :: bad12 = int(111111111111111111111, 8) !CHECK: warning: overflow on REAL(8) to REAL(4) conversion x = 1.D40 !CHECK-NOT: warning: invalid argument From 9b3818ecae5a5c47eb6a8dd44cf7e1c3666a0f02 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Mon, 30 Sep 2024 12:38:24 -0700 Subject: [PATCH 019/151] [flang] Downgrade specific format error to warning (#110314) When a format is missing a comma between two edit descriptors, the previous token was an integer, and the following item is a repeatable edit descriptor or a parenthesized group, we emit an error, since it can't be known where the digits of the integer should be split. But in the case of a single digit, the situation is not ambiguous, and the message should be a warning. Fixes https://github.com/llvm/llvm-project/issues/110261. --- flang/include/flang/Common/format.h | 10 ++++++---- flang/test/Semantics/io07.f90 | 11 +++++++++-- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/flang/include/flang/Common/format.h b/flang/include/flang/Common/format.h index 2374ff6983cf4..67d37bee32ab3 100644 --- a/flang/include/flang/Common/format.h +++ b/flang/include/flang/Common/format.h @@ -136,7 +136,7 @@ template class FormatValidator { const CHAR *cursor_{}; // current location in format_ const CHAR *laCursor_{}; // lookahead cursor - TokenKind previousTokenKind_{TokenKind::None}; + Token previousToken_{}; Token token_{}; // current token Token knrToken_{}; // k, n, or r UnsignedInteger token Token scaleFactorToken_{}; // most recent scale factor token P @@ -193,7 +193,7 @@ template void FormatValidator::NextToken() { // At entry, cursor_ points before the start of the next token. // At exit, cursor_ points to last CHAR of token_. 
- previousTokenKind_ = token_.kind(); + previousToken_ = token_; CHAR c{NextChar()}; token_.set_kind(TokenKind::None); token_.set_offset(cursor_ - format_); @@ -431,7 +431,7 @@ template void FormatValidator::NextToken() { } SetLength(); if (stmt_ == IoStmtKind::Read && - previousTokenKind_ != TokenKind::DT) { // 13.3.2p6 + previousToken_.kind() != TokenKind::DT) { // 13.3.2p6 ReportError("String edit descriptor in READ format expression"); } else if (token_.kind() != TokenKind::String) { ReportError("Unterminated string"); @@ -887,8 +887,10 @@ template bool FormatValidator::Check() { // Possible first token of the next format item; token not yet processed. if (commaRequired) { const char *s{"Expected ',' or ')' in format expression"}; // C1302 - if (previousTokenKind_ == TokenKind::UnsignedInteger && + if (previousToken_.kind() == TokenKind::UnsignedInteger && + previousToken_.length() > 1 && itemsWithLeadingInts_.test(token_.kind())) { + // F10.32F10.3 is ambiguous, F10.3F10.3 is not ReportError(s); } else { ReportWarning(s); diff --git a/flang/test/Semantics/io07.f90 b/flang/test/Semantics/io07.f90 index 1c13c7df20a31..64a32c9959287 100644 --- a/flang/test/Semantics/io07.f90 +++ b/flang/test/Semantics/io07.f90 @@ -1,4 +1,4 @@ -! RUN: %python %S/test_errors.py %s %flang_fc1 +! RUN: %python %S/test_errors.py %s %flang_fc1 -pedantic 1001 format(A) !ERROR: Format statement must be labeled @@ -23,9 +23,13 @@ endif ! 
C1302 warnings; no errors + !WARNING: Expected ',' or ')' in format expression 2051 format(1X3/) + !WARNING: Expected ',' or ')' in format expression 2052 format(1X003/) + !WARNING: Expected ',' or ')' in format expression 2053 format(3P7I2) + !WARNING: Expected ',' or ')' in format expression 2054 format(3PI2) !ERROR: Expected ',' or ')' in format expression @@ -37,13 +41,14 @@ !ERROR: Expected ',' or ')' in format expression 2103 format(3I8 3Z8) - !ERROR: Expected ',' or ')' in format expression + !WARNING: Expected ',' or ')' in format expression 2104 format(3I8 Z8) 3001 format(*(I3)) 3002 format(5X,*(2(A))) !ERROR: Unlimited format item list must contain a data edit descriptor + !WARNING: 'X' edit descriptor must have a positive position value 3101 format(*(X)) !ERROR: Unlimited format item list must contain a data edit descriptor @@ -52,9 +57,11 @@ !ERROR: Unlimited format item list must contain a data edit descriptor 3103 format(5X, 'abc', *((:))) + !WARNING: 'X' edit descriptor must have a positive position value 4001 format(2(X)) !ERROR: List repeat specifier must be positive + !WARNING: 'X' edit descriptor must have a positive position value !ERROR: 'DT' edit descriptor repeat specifier must be positive 4101 format(0(X), 0dt) From 4dfed691a9f846b1ff773e28b878404b78559890 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Mon, 30 Sep 2024 12:38:47 -0700 Subject: [PATCH 020/151] [flang][preprocessor] Don't expand INCLUDE under -E by default (#110333) Fortran INCLUDE lines have (until now) been treated like #include directives. This isn't how things work with other Fortran compilers when running under the -E option for preprocessing only, so stop doing it by default, and add -fpreprocess-include-lines to turn it back on when desired. 
--- clang/include/clang/Driver/Options.td | 2 ++ flang/include/flang/Frontend/PreprocessorOptions.h | 3 +++ flang/include/flang/Parser/parsing.h | 1 + flang/lib/Frontend/CompilerInvocation.cpp | 6 ++++++ flang/lib/Frontend/FrontendAction.cpp | 4 ++++ flang/lib/Parser/parsing.cpp | 2 ++ flang/lib/Parser/prescan.cpp | 3 +++ flang/lib/Parser/prescan.h | 5 +++++ flang/test/Parser/include.f | 2 +- 9 files changed, 27 insertions(+), 1 deletion(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index aedc4c16d4e9d..607ff47a857b8 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -6898,6 +6898,8 @@ def module_suffix : Separate<["-"], "module-suffix">, Group, MetaVarNa HelpText<"Use as the suffix for module files (the default value is `.mod`)">; def fno_reformat : Flag<["-"], "fno-reformat">, Group, HelpText<"Dump the cooked character stream in -E mode">; +def fpreprocess_include_lines : Flag<["-"], "fpreprocess-include-lines">, Group, + HelpText<"Treat INCLUDE lines like #include directives in -E mode">; defm analyzed_objects_for_unparse : OptOutFC1FFlag<"analyzed-objects-for-unparse", "", "Do not use the analyzed objects when unparsing">; def emit_fir : Flag<["-"], "emit-fir">, Group, diff --git a/flang/include/flang/Frontend/PreprocessorOptions.h b/flang/include/flang/Frontend/PreprocessorOptions.h index 13a91ee9a184f..2de9dabb1b372 100644 --- a/flang/include/flang/Frontend/PreprocessorOptions.h +++ b/flang/include/flang/Frontend/PreprocessorOptions.h @@ -56,6 +56,9 @@ struct PreprocessorOptions { // -fno-reformat: Emit cooked character stream as -E output bool noReformat{false}; + // -fpreprocess-include-lines: Treat INCLUDE as #include for -E output + bool preprocessIncludeLines{false}; + // -dM: Show macro definitions with -dM -E bool showMacros{false}; diff --git a/flang/include/flang/Parser/parsing.h b/flang/include/flang/Parser/parsing.h index 4d329c189cb80..0c774decb16d3 
100644 --- a/flang/include/flang/Parser/parsing.h +++ b/flang/include/flang/Parser/parsing.h @@ -40,6 +40,7 @@ struct Options { bool needProvenanceRangeToCharBlockMappings{false}; Fortran::parser::Encoding encoding{Fortran::parser::Encoding::UTF_8}; bool prescanAndReformat{false}; // -E + bool expandIncludeLinesInPreprocessedOutput{true}; bool showColors{false}; }; diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index d68534d5509fe..2154b9ab2fbf4 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -820,6 +820,8 @@ static void parsePreprocessorArgs(Fortran::frontend::PreprocessorOptions &opts, : PPMacrosFlag::Exclude; opts.noReformat = args.hasArg(clang::driver::options::OPT_fno_reformat); + opts.preprocessIncludeLines = + args.hasArg(clang::driver::options::OPT_fpreprocess_include_lines); opts.noLineDirectives = args.hasArg(clang::driver::options::OPT_P); opts.showMacros = args.hasArg(clang::driver::options::OPT_dM); } @@ -1486,6 +1488,10 @@ void CompilerInvocation::setFortranOpts() { } fortranOptions.fixedFormColumns = frontendOptions.fixedFormColumns; + // -E + fortranOptions.prescanAndReformat = + frontendOptions.programAction == PrintPreprocessedInput; + fortranOptions.features = frontendOptions.features; fortranOptions.encoding = frontendOptions.encoding; diff --git a/flang/lib/Frontend/FrontendAction.cpp b/flang/lib/Frontend/FrontendAction.cpp index 42a614fe46be5..041182bdf6178 100644 --- a/flang/lib/Frontend/FrontendAction.cpp +++ b/flang/lib/Frontend/FrontendAction.cpp @@ -95,6 +95,10 @@ bool FrontendAction::beginSourceFile(CompilerInstance &ci, getCurrentInput().getIsCUDAFortran()); } + // -fpreprocess-include-lines + invoc.getFortranOpts().expandIncludeLinesInPreprocessedOutput = + invoc.getPreprocessorOpts().preprocessIncludeLines; + // Decide between fixed and free form (if the user didn't express any // preference, use the file extension to decide) 
if (invoc.getFrontendOpts().fortranForm == FortranForm::Unknown) { diff --git a/flang/lib/Parser/parsing.cpp b/flang/lib/Parser/parsing.cpp index 37dc113436aa0..d8448e4c527ac 100644 --- a/flang/lib/Parser/parsing.cpp +++ b/flang/lib/Parser/parsing.cpp @@ -75,6 +75,8 @@ const SourceFile *Parsing::Prescan(const std::string &path, Options options) { messages_, *currentCooked_, preprocessor_, options.features}; prescanner.set_fixedForm(options.isFixedForm) .set_fixedFormColumnLimit(options.fixedFormColumns) + .set_expandIncludeLines(!options.prescanAndReformat || + options.expandIncludeLinesInPreprocessedOutput) .AddCompilerDirectiveSentinel("dir$"); if (options.features.IsEnabled(LanguageFeature::OpenACC)) { prescanner.AddCompilerDirectiveSentinel("$acc"); diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp index b231c5859cf87..eabfcc244001a 100644 --- a/flang/lib/Parser/prescan.cpp +++ b/flang/lib/Parser/prescan.cpp @@ -1031,6 +1031,9 @@ const char *Prescanner::IsFreeFormComment(const char *p) const { } std::optional Prescanner::IsIncludeLine(const char *start) const { + if (!expandIncludeLines_) { + return std::nullopt; + } const char *p{SkipWhiteSpace(start)}; if (*p == '0' && inFixedForm_ && p == start + 5) { // Accept " 0INCLUDE" in fixed form. 
diff --git a/flang/lib/Parser/prescan.h b/flang/lib/Parser/prescan.h index 9d4f7c0c302a1..c50bf231e3c70 100644 --- a/flang/lib/Parser/prescan.h +++ b/flang/lib/Parser/prescan.h @@ -48,6 +48,10 @@ class Prescanner { Preprocessor &preprocessor() { return preprocessor_; } common::LanguageFeatureControl &features() { return features_; } + Prescanner &set_expandIncludeLines(bool yes) { + expandIncludeLines_ = yes; + return *this; + } Prescanner &set_fixedForm(bool yes) { inFixedForm_ = yes; return *this; @@ -209,6 +213,7 @@ class Prescanner { Preprocessor &preprocessor_; AllSources &allSources_; common::LanguageFeatureControl features_; + bool expandIncludeLines_{true}; bool isNestedInIncludeDirective_{false}; bool backslashFreeFormContinuation_{false}; bool inFixedForm_{false}; diff --git a/flang/test/Parser/include.f b/flang/test/Parser/include.f index 8a7fe3a2ecd9d..6e16afd92ad0c 100644 --- a/flang/test/Parser/include.f +++ b/flang/test/Parser/include.f @@ -1,4 +1,4 @@ -! RUN: %flang_fc1 -E -I %S/Inputs %s 2>&1 | FileCheck %s +! RUN: %flang_fc1 -E -fpreprocess-include-lines -I %S/Inputs %s 2>&1 | FileCheck %s include 'include-file' include "include-file" include 1_'include-file' From f8ba021e64a0e76c8750a3666da68a59508afd97 Mon Sep 17 00:00:00 2001 From: Dimple Prajapati Date: Mon, 30 Sep 2024 12:39:13 -0700 Subject: [PATCH 021/151] [mlir][spirv] Add gpu printf op lowering to spirv.CL.printf op (#78510) This change contains following: - adds lowering of printf op to spirv.CL.printf op in GPUToSPIRV pass. - Fixes Constant decoration parsing for spirv GlobalVariable. - minor modification to spirv.CL.printf op assembly format. 
--------- Co-authored-by: Jakub Kuderski --- .../mlir/Dialect/SPIRV/IR/SPIRVCLOps.td | 4 +- mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp | 130 +++++++++++++++++- .../SPIRV/Deserialization/Deserializer.cpp | 1 + .../Target/SPIRV/Serialization/Serializer.cpp | 1 + mlir/test/Conversion/GPUToSPIRV/printf.mlir | 71 ++++++++++ mlir/test/Dialect/SPIRV/IR/ocl-ops.mlir | 6 +- 6 files changed, 207 insertions(+), 6 deletions(-) create mode 100644 mlir/test/Conversion/GPUToSPIRV/printf.mlir diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCLOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCLOps.td index c7c2fe8bc742c..5d086325fa5b1 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCLOps.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCLOps.td @@ -875,7 +875,7 @@ def SPIRV_CLPrintfOp : SPIRV_CLOp<"printf", 184, []> { #### Example: ```mlir - %0 = spirv.CL.printf %0 %1 %2 : (!spirv.ptr, (i32, i32)) -> i32 + %0 = spirv.CL.printf %fmt %1, %2 : !spirv.ptr, i32, i32 -> i32 ``` }]; @@ -889,7 +889,7 @@ def SPIRV_CLPrintfOp : SPIRV_CLOp<"printf", 184, []> { ); let assemblyFormat = [{ - $format `,` $arguments attr-dict `:` `(` type($format) `,` `(` type($arguments) `)` `)` `->` type($result) + $format ( $arguments^ )? attr-dict `:` type($format) ( `,` type($arguments)^ )? 
`->` type($result) }]; let hasVerifier = 0; diff --git a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp index a8ff9247e796a..53b4c720ae56d 100644 --- a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp +++ b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp @@ -121,6 +121,15 @@ class GPUShuffleConversion final : public OpConversionPattern { ConversionPatternRewriter &rewriter) const override; }; +class GPUPrintfConversion final : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(gpu::PrintfOp gpuPrintfOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; +}; + } // namespace //===----------------------------------------------------------------------===// @@ -597,6 +606,124 @@ class GPUSubgroupReduceConversion final } }; +// Formulate a unique variable/constant name after +// searching in the module for existing variable/constant names. +// This is to avoid name collision with existing variables. +// Example: printfMsg0, printfMsg1, printfMsg2, ... +static std::string makeVarName(spirv::ModuleOp moduleOp, llvm::Twine prefix) { + std::string name; + unsigned number = 0; + + do { + name.clear(); + name = (prefix + llvm::Twine(number++)).str(); + } while (moduleOp.lookupSymbol(name)); + + return name; +} + +/// Pattern to convert a gpu.printf op into a SPIR-V CLPrintf op. + +LogicalResult GPUPrintfConversion::matchAndRewrite( + gpu::PrintfOp gpuPrintfOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { + + Location loc = gpuPrintfOp.getLoc(); + + auto moduleOp = gpuPrintfOp->getParentOfType(); + if (!moduleOp) + return failure(); + + // SPIR-V global variable is used to initialize printf + // format string value, if there are multiple printf messages, + // each global var needs to be created with a unique name. 
+ std::string globalVarName = makeVarName(moduleOp, llvm::Twine("printfMsg")); + spirv::GlobalVariableOp globalVar; + + IntegerType i8Type = rewriter.getI8Type(); + IntegerType i32Type = rewriter.getI32Type(); + + // Each character of printf format string is + // stored as a spec constant. We need to create + // unique name for this spec constant like + // @printfMsg0_sc0, @printfMsg0_sc1, ... by searching in the module + // for existing spec constant names. + auto createSpecConstant = [&](unsigned value) { + auto attr = rewriter.getI8IntegerAttr(value); + std::string specCstName = + makeVarName(moduleOp, llvm::Twine(globalVarName) + "_sc"); + + return rewriter.create( + loc, rewriter.getStringAttr(specCstName), attr); + }; + { + Operation *parent = + SymbolTable::getNearestSymbolTable(gpuPrintfOp->getParentOp()); + + ConversionPatternRewriter::InsertionGuard guard(rewriter); + + Block &entryBlock = *parent->getRegion(0).begin(); + rewriter.setInsertionPointToStart( + &entryBlock); // insertion point at module level + + // Create Constituents with SpecConstant by scanning format string + // Each character of format string is stored as a spec constant + // and then these spec constants are used to create a + // SpecConstantCompositeOp. + llvm::SmallString<20> formatString(adaptor.getFormat()); + formatString.push_back('\0'); // Null terminate for C. + SmallVector constituents; + for (char c : formatString) { + spirv::SpecConstantOp cSpecConstantOp = createSpecConstant(c); + constituents.push_back(SymbolRefAttr::get(cSpecConstantOp)); + } + + // Create SpecConstantCompositeOp to initialize the global variable + size_t contentSize = constituents.size(); + auto globalType = spirv::ArrayType::get(i8Type, contentSize); + spirv::SpecConstantCompositeOp specCstComposite; + // There will be one SpecConstantCompositeOp per printf message/global var, + // so no need do lookup for existing ones. 
+ std::string specCstCompositeName = + (llvm::Twine(globalVarName) + "_scc").str(); + + specCstComposite = rewriter.create( + loc, TypeAttr::get(globalType), + rewriter.getStringAttr(specCstCompositeName), + rewriter.getArrayAttr(constituents)); + + auto ptrType = spirv::PointerType::get( + globalType, spirv::StorageClass::UniformConstant); + + // Define a GlobalVarOp initialized using specialized constants + // that is used to specify the printf format string + // to be passed to the SPIRV CLPrintfOp. + globalVar = rewriter.create( + loc, ptrType, globalVarName, FlatSymbolRefAttr::get(specCstComposite)); + + globalVar->setAttr("Constant", rewriter.getUnitAttr()); + } + // Get SSA value of Global variable and create pointer to i8 to point to + // the format string. + Value globalPtr = rewriter.create(loc, globalVar); + Value fmtStr = rewriter.create( + loc, + spirv::PointerType::get(i8Type, spirv::StorageClass::UniformConstant), + globalPtr); + + // Get printf arguments. + auto printfArgs = llvm::to_vector_of(adaptor.getArgs()); + + rewriter.create(loc, i32Type, fmtStr, printfArgs); + + // Need to erase the gpu.printf op as gpu.printf does not use result vs + // spirv::CLPrintfOp has i32 resultType so cannot replace with new SPIR-V + // printf op. + rewriter.eraseOp(gpuPrintfOp); + + return success(); +} + //===----------------------------------------------------------------------===// // GPU To SPIRV Patterns. 
//===----------------------------------------------------------------------===// @@ -620,5 +747,6 @@ void mlir::populateGPUToSPIRVPatterns(SPIRVTypeConverter &typeConverter, SingleDimLaunchConfigConversion, WorkGroupSizeConversion, GPUAllReduceConversion, - GPUSubgroupReduceConversion>(typeConverter, patterns.getContext()); + GPUSubgroupReduceConversion, GPUPrintfConversion>(typeConverter, + patterns.getContext()); } diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp index 38293f7106a05..6c7fe41069824 100644 --- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp +++ b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp @@ -319,6 +319,7 @@ LogicalResult spirv::Deserializer::processDecoration(ArrayRef words) { case spirv::Decoration::Restrict: case spirv::Decoration::RestrictPointer: case spirv::Decoration::NoContraction: + case spirv::Decoration::Constant: if (words.size() != 2) { return emitError(unknownLoc, "OpDecoration with ") << decorationName << "needs a single target "; diff --git a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp index 7719eb68b2c2e..f355982e9ed88 100644 --- a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp +++ b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp @@ -286,6 +286,7 @@ LogicalResult Serializer::processDecorationAttr(Location loc, uint32_t resultID, case spirv::Decoration::Restrict: case spirv::Decoration::RestrictPointer: case spirv::Decoration::NoContraction: + case spirv::Decoration::Constant: // For unit attributes and decoration attributes, the args list // has no values so we do nothing. 
if (isa(attr)) diff --git a/mlir/test/Conversion/GPUToSPIRV/printf.mlir b/mlir/test/Conversion/GPUToSPIRV/printf.mlir new file mode 100644 index 0000000000000..bc091124ea4c6 --- /dev/null +++ b/mlir/test/Conversion/GPUToSPIRV/printf.mlir @@ -0,0 +1,71 @@ +// RUN: mlir-opt -allow-unregistered-dialect -split-input-file -convert-gpu-to-spirv -verify-diagnostics %s | FileCheck %s + +module attributes { + gpu.container_module, + spirv.target_env = #spirv.target_env<#spirv.vce, #spirv.resource_limits<>> +} { + func.func @main() { + %c1 = arith.constant 1 : index + + gpu.launch_func @kernels::@printf + blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) + args() + return + } + + gpu.module @kernels { + // CHECK: spirv.module @{{.*}} Physical32 OpenCL + // CHECK-DAG: spirv.SpecConstant [[SPECCST:@.*]] = {{.*}} : i8 + // CHECK-DAG: spirv.SpecConstantComposite [[SPECCSTCOMPOSITE:@.*]] ([[SPECCST]], {{.*}}) : !spirv.array<[[ARRAYSIZE:.*]] x i8> + // CHECK-DAG: spirv.GlobalVariable [[PRINTMSG:@.*]] initializer([[SPECCSTCOMPOSITE]]) {Constant} : !spirv.ptr, UniformConstant> + gpu.func @printf() kernel + attributes + {spirv.entry_point_abi = #spirv.entry_point_abi<>} { + // CHECK: [[FMTSTR_ADDR:%.*]] = spirv.mlir.addressof [[PRINTMSG]] : !spirv.ptr, UniformConstant> + // CHECK-NEXT: [[FMTSTR_PTR:%.*]] = spirv.Bitcast [[FMTSTR_ADDR]] : !spirv.ptr, UniformConstant> to !spirv.ptr + // CHECK-NEXT {{%.*}} = spirv.CL.printf [[FMTSTR_PTR]] : !spirv.ptr -> i32 + gpu.printf "\nHello\n" + // CHECK: spirv.Return + gpu.return + } + } +} + +// ----- + +module attributes { + gpu.container_module, + spirv.target_env = #spirv.target_env<#spirv.vce, #spirv.resource_limits<>> +} { + func.func @main() { + %c1 = arith.constant 1 : index + %c100 = arith.constant 100: i32 + %cst_f32 = arith.constant 314.4: f32 + + gpu.launch_func @kernels1::@printf_args + blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) + args(%c100: i32, %cst_f32: f32) + return + } + + gpu.module @kernels1 { + // CHECK: 
spirv.module @{{.*}} Physical32 OpenCL { + // CHECK-DAG: spirv.SpecConstant [[SPECCST:@.*]] = {{.*}} : i8 + // CHECK-DAG: spirv.SpecConstantComposite [[SPECCSTCOMPOSITE:@.*]] ([[SPECCST]], {{.*}}) : !spirv.array<[[ARRAYSIZE:.*]] x i8> + // CHECK-DAG: spirv.GlobalVariable [[PRINTMSG:@.*]] initializer([[SPECCSTCOMPOSITE]]) {Constant} : !spirv.ptr, UniformConstant> + gpu.func @printf_args(%arg0: i32, %arg1: f32) kernel + attributes {spirv.entry_point_abi = #spirv.entry_point_abi<>} { + %0 = gpu.block_id x + %1 = gpu.block_id y + %2 = gpu.thread_id x + + // CHECK: [[FMTSTR_ADDR:%.*]] = spirv.mlir.addressof [[PRINTMSG]] : !spirv.ptr, UniformConstant> + // CHECK-NEXT: [[FMTSTR_PTR1:%.*]] = spirv.Bitcast [[FMTSTR_ADDR]] : !spirv.ptr, UniformConstant> to !spirv.ptr + // CHECK-NEXT: {{%.*}} = spirv.CL.printf [[FMTSTR_PTR1]] {{%.*}}, {{%.*}}, {{%.*}} : !spirv.ptr, i32, f32, i32 -> i32 + gpu.printf "\nHello, world : %d %f \n Thread id: %d\n" %arg0, %arg1, %2: i32, f32, index + + // CHECK: spirv.Return + gpu.return + } + } +} diff --git a/mlir/test/Dialect/SPIRV/IR/ocl-ops.mlir b/mlir/test/Dialect/SPIRV/IR/ocl-ops.mlir index 81ba471d3f51e..8f021ed3d663d 100644 --- a/mlir/test/Dialect/SPIRV/IR/ocl-ops.mlir +++ b/mlir/test/Dialect/SPIRV/IR/ocl-ops.mlir @@ -274,9 +274,9 @@ func.func @rintvec(%arg0 : vector<3xf16>) -> () { // spirv.CL.printf //===----------------------------------------------------------------------===// // CHECK-LABEL: func.func @printf( -func.func @printf(%arg0 : !spirv.ptr, %arg1 : i32, %arg2 : i32) -> i32 { - // CHECK: spirv.CL.printf {{%.*}}, {{%.*}}, {{%.*}} : (!spirv.ptr, (i32, i32)) -> i32 - %0 = spirv.CL.printf %arg0, %arg1, %arg2 : (!spirv.ptr, (i32, i32)) -> i32 +func.func @printf(%fmt : !spirv.ptr, %arg1 : i32, %arg2 : i32) -> i32 { + // CHECK: spirv.CL.printf {{%.*}} {{%.*}}, {{%.*}} : !spirv.ptr, i32, i32 -> i32 + %0 = spirv.CL.printf %fmt %arg1, %arg2 : !spirv.ptr, i32, i32 -> i32 return %0 : i32 } From 53943de73aa8fa7a9497028100e987a3b73ac339 Mon 
Sep 17 00:00:00 2001 From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= Date: Mon, 30 Sep 2024 22:12:06 +0200 Subject: [PATCH 022/151] [GlobalISel] Import extract/insert subvector (#110287) Test: AArch64/GlobalISel/irtranslator-subvector.ll Reference: https://llvm.org/docs/LangRef.html#llvm-vector-extract-intrinsic https://llvm.org/docs/LangRef.html#llvm-vector-insert-intrinsic --- .../llvm/CodeGen/GlobalISel/IRTranslator.h | 2 + llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 103 +++++ .../GlobalISel/irtranslator-subvector.ll | 352 ++++++++++++++++++ 3 files changed, 457 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-subvector.ll diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h index 2796ea4a86617..6fd05c8fddd5f 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h @@ -546,8 +546,10 @@ class IRTranslator : public MachineFunctionPass { bool translateVAArg(const User &U, MachineIRBuilder &MIRBuilder); bool translateInsertElement(const User &U, MachineIRBuilder &MIRBuilder); + bool translateInsertVector(const User &U, MachineIRBuilder &MIRBuilder); bool translateExtractElement(const User &U, MachineIRBuilder &MIRBuilder); + bool translateExtractVector(const User &U, MachineIRBuilder &MIRBuilder); bool translateShuffleVector(const User &U, MachineIRBuilder &MIRBuilder); diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 7ff8d2446eec5..40360b0b0f1d8 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -2588,6 +2588,10 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, getOrCreateVReg(*CI.getOperand(0)), getOrCreateVReg(*CI.getOperand(1))); return true; + case Intrinsic::vector_extract: + return translateExtractVector(CI, MIRBuilder); + case 
Intrinsic::vector_insert: + return translateInsertVector(CI, MIRBuilder); case Intrinsic::prefetch: { Value *Addr = CI.getOperand(0); unsigned RW = cast(CI.getOperand(1))->getZExtValue(); @@ -3163,6 +3167,57 @@ bool IRTranslator::translateInsertElement(const User &U, return true; } +bool IRTranslator::translateInsertVector(const User &U, + MachineIRBuilder &MIRBuilder) { + Register Dst = getOrCreateVReg(U); + Register Vec = getOrCreateVReg(*U.getOperand(0)); + Register Elt = getOrCreateVReg(*U.getOperand(1)); + + ConstantInt *CI = cast(U.getOperand(2)); + unsigned PreferredVecIdxWidth = TLI->getVectorIdxTy(*DL).getSizeInBits(); + + // Resize Index to preferred index width. + if (CI->getBitWidth() != PreferredVecIdxWidth) { + APInt NewIdx = CI->getValue().zextOrTrunc(PreferredVecIdxWidth); + CI = ConstantInt::get(CI->getContext(), NewIdx); + } + + // If it is a <1 x Ty> vector, we have to use other means. + if (auto *ResultType = dyn_cast(U.getOperand(1)->getType()); + ResultType && ResultType->getNumElements() == 1) { + if (auto *InputType = dyn_cast(U.getOperand(0)->getType()); + InputType && InputType->getNumElements() == 1) { + // We are inserting an illegal fixed vector into an illegal + // fixed vector, use the scalar as it is not a legal vector type + // in LLT. + return translateCopy(U, *U.getOperand(0), MIRBuilder); + } + if (isa(U.getOperand(0)->getType())) { + // We are inserting an illegal fixed vector into a legal fixed + // vector, use the scalar as it is not a legal vector type in + // LLT. + Register Idx = getOrCreateVReg(*CI); + MIRBuilder.buildInsertVectorElement(Dst, Vec, Elt, Idx); + return true; + } + if (isa(U.getOperand(0)->getType())) { + // We are inserting an illegal fixed vector into a scalable + // vector, use a scalar element insert. 
+ LLT VecIdxTy = LLT::scalar(PreferredVecIdxWidth); + Register Idx = getOrCreateVReg(*CI); + auto ScaledIndex = MIRBuilder.buildMul( + VecIdxTy, MIRBuilder.buildVScale(VecIdxTy, 1), Idx); + MIRBuilder.buildInsertVectorElement(Dst, Vec, Elt, ScaledIndex); + return true; + } + } + + MIRBuilder.buildInsertSubvector( + getOrCreateVReg(U), getOrCreateVReg(*U.getOperand(0)), + getOrCreateVReg(*U.getOperand(1)), CI->getZExtValue()); + return true; +} + bool IRTranslator::translateExtractElement(const User &U, MachineIRBuilder &MIRBuilder) { // If it is a <1 x Ty> vector, use the scalar as it is @@ -3191,6 +3246,54 @@ bool IRTranslator::translateExtractElement(const User &U, return true; } +bool IRTranslator::translateExtractVector(const User &U, + MachineIRBuilder &MIRBuilder) { + Register Res = getOrCreateVReg(U); + Register Vec = getOrCreateVReg(*U.getOperand(0)); + ConstantInt *CI = cast(U.getOperand(1)); + unsigned PreferredVecIdxWidth = TLI->getVectorIdxTy(*DL).getSizeInBits(); + + // Resize Index to preferred index width. + if (CI->getBitWidth() != PreferredVecIdxWidth) { + APInt NewIdx = CI->getValue().zextOrTrunc(PreferredVecIdxWidth); + CI = ConstantInt::get(CI->getContext(), NewIdx); + } + + // If it is a <1 x Ty> vector, we have to use other means. + if (auto *ResultType = dyn_cast(U.getType()); + ResultType && ResultType->getNumElements() == 1) { + if (auto *InputType = dyn_cast(U.getOperand(0)->getType()); + InputType && InputType->getNumElements() == 1) { + // We are extracting an illegal fixed vector from an illegal fixed vector, + // use the scalar as it is not a legal vector type in LLT. + return translateCopy(U, *U.getOperand(0), MIRBuilder); + } + if (isa(U.getOperand(0)->getType())) { + // We are extracting an illegal fixed vector from a legal fixed + // vector, use the scalar as it is not a legal vector type in + // LLT. 
+ Register Idx = getOrCreateVReg(*CI); + MIRBuilder.buildExtractVectorElement(Res, Vec, Idx); + return true; + } + if (isa(U.getOperand(0)->getType())) { + // We are extracting an illegal fixed vector from a scalable + // vector, use a scalar element extract. + LLT VecIdxTy = LLT::scalar(PreferredVecIdxWidth); + Register Idx = getOrCreateVReg(*CI); + auto ScaledIndex = MIRBuilder.buildMul( + VecIdxTy, MIRBuilder.buildVScale(VecIdxTy, 1), Idx); + MIRBuilder.buildExtractVectorElement(Res, Vec, ScaledIndex); + return true; + } + } + + MIRBuilder.buildExtractSubvector(getOrCreateVReg(U), + getOrCreateVReg(*U.getOperand(0)), + CI->getZExtValue()); + return true; +} + bool IRTranslator::translateShuffleVector(const User &U, MachineIRBuilder &MIRBuilder) { // A ShuffleVector that operates on scalable vectors is a splat vector where diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-subvector.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-subvector.ll new file mode 100644 index 0000000000000..149bf72b053ef --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-subvector.ll @@ -0,0 +1,352 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -O0 -mtriple=aarch64-linux-gnu -mattr=+sve -global-isel -stop-after=irtranslator -aarch64-enable-gisel-sve=1 %s -o - | FileCheck %s + +define i32 @extract_v4i32_vector_insert_const(<4 x i32> %a, <2 x i32> %b, i32 %c) { + ; CHECK-LABEL: name: extract_v4i32_vector_insert_const + ; CHECK: bb.1.entry: + ; CHECK-NEXT: liveins: $d1, $q0, $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $d1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<4 x s32>) = G_INSERT_SUBVECTOR [[COPY]], [[COPY1]](<2 x s32>), 0 + ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT 
[[INSERT_SUBVECTOR]](<4 x s32>), [[C]](s64) + ; CHECK-NEXT: $w0 = COPY [[EVEC]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 +entry: + %vector = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> %a, <2 x i32> %b, i64 0) + %d = extractelement <4 x i32> %vector, i32 1 + ret i32 %d +} + +define double @extract_v4double_vector_insert_const(<4 x double> %a, <2 x double> %b, i32 %c) { + ; CHECK-LABEL: name: extract_v4double_vector_insert_const + ; CHECK: bb.1.entry: + ; CHECK-NEXT: liveins: $q0, $q1, $q2, $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[COPY]](<2 x s64>), [[COPY1]](<2 x s64>) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<4 x s64>) = G_INSERT_SUBVECTOR [[CONCAT_VECTORS]], [[COPY2]](<2 x s64>), 0 + ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s64) = G_EXTRACT_VECTOR_ELT [[INSERT_SUBVECTOR]](<4 x s64>), [[C]](s64) + ; CHECK-NEXT: $d0 = COPY [[EVEC]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $d0 +entry: + %vector = call <4 x double> @llvm.vector.insert.v4f64.v2f64(<4 x double> %a, <2 x double> %b, i64 0) + %d = extractelement <4 x double> %vector, i32 1 + ret double %d +} + +define float @extract_v4float_vector_insert_const(<4 x float> %a, <2 x float> %b, i32 %c) { + ; CHECK-LABEL: name: extract_v4float_vector_insert_const + ; CHECK: bb.1.entry: + ; CHECK-NEXT: liveins: $d1, $q0, $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $d1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: 
[[INSERT_SUBVECTOR:%[0-9]+]]:_(<4 x s32>) = G_INSERT_SUBVECTOR [[BITCAST]], [[COPY1]](<2 x s32>), 0 + ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[INSERT_SUBVECTOR]](<4 x s32>), [[C]](s64) + ; CHECK-NEXT: $s0 = COPY [[EVEC]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $s0 +entry: + %vector = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> %a, <2 x float> %b, i64 0) + %d = extractelement <4 x float> %vector, i32 1 + ret float %d +} + +define i32 @extract_v4i32_vector_insert(<4 x i32> %a, <2 x i32> %b, i32 %c) { + ; CHECK-LABEL: name: extract_v4i32_vector_insert + ; CHECK: bb.1.entry: + ; CHECK-NEXT: liveins: $d1, $q0, $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $d1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<4 x s32>) = G_INSERT_SUBVECTOR [[COPY]], [[COPY1]](<2 x s32>), 0 + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY2]](s32) + ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[INSERT_SUBVECTOR]](<4 x s32>), [[ZEXT]](s64) + ; CHECK-NEXT: $w0 = COPY [[EVEC]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 +entry: + %vector = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> %a, <2 x i32> %b, i64 0) + %d = extractelement <4 x i32> %vector, i32 %c + ret i32 %d +} + +define i32 @extract_v4i32_vector_extract(<4 x i32> %a, <2 x i32> %b, i32 %c) { + ; CHECK-LABEL: name: extract_v4i32_vector_extract + ; CHECK: bb.1.entry: + ; CHECK-NEXT: liveins: $d1, $q0, $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $d1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[EXTRACT_SUBVECTOR:%[0-9]+]]:_(<4 x s32>) = G_EXTRACT_SUBVECTOR [[COPY]](<4 x s32>), 0 + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY2]](s32) + ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = 
G_EXTRACT_VECTOR_ELT [[EXTRACT_SUBVECTOR]](<4 x s32>), [[ZEXT]](s64) + ; CHECK-NEXT: $w0 = COPY [[EVEC]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 +entry: + %vector = call <4 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %a, i64 0) + %d = extractelement <4 x i32> %vector, i32 %c + ret i32 %d +} + +define i32 @extract_v4i32_vector_extract_const( %a, i32 %c, ptr %p) { + ; CHECK-LABEL: name: extract_v4i32_vector_extract_const + ; CHECK: bb.1.entry: + ; CHECK-NEXT: liveins: $w0, $x1, $z0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_() = COPY $z0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY $x1 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[EXTRACT_SUBVECTOR:%[0-9]+]]:_() = G_EXTRACT_SUBVECTOR [[COPY]](), 0 + ; CHECK-NEXT: G_STORE [[EXTRACT_SUBVECTOR]](), [[COPY2]](p0) :: (store () into %ir.p) + ; CHECK-NEXT: $w0 = COPY [[C]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 +entry: + %vector = call @llvm.vector.extract( %a, i64 0) + store %vector, ptr %p, align 16 + ret i32 1 +} + +define i32 @extract_v2double_vector_extract_const( %a, i32 %c, ptr %p) { + ; CHECK-LABEL: name: extract_v2double_vector_extract_const + ; CHECK: bb.1.entry: + ; CHECK-NEXT: liveins: $w0, $x1, $z0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_() = COPY $z0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY $x1 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[EXTRACT_SUBVECTOR:%[0-9]+]]:_() = G_EXTRACT_SUBVECTOR [[COPY]](), 0 + ; CHECK-NEXT: G_STORE [[EXTRACT_SUBVECTOR]](), [[COPY2]](p0) :: (store () into %ir.p) + ; CHECK-NEXT: $w0 = COPY [[C]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 +entry: + %vector = call @llvm.vector.extract( %a, i64 0) + store %vector, ptr %p, align 16 + ret i32 1 +} + +define i32 @extract_v4i32_vector_insert_const_vscale( %a, %b, i32 %c, ptr %p) { + ; CHECK-LABEL: name: 
extract_v4i32_vector_insert_const_vscale + ; CHECK: bb.1.entry: + ; CHECK-NEXT: liveins: $w0, $x1, $z0, $z1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_() = COPY $z0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_() = COPY $z1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(p0) = COPY $x1 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[COPY]], [[COPY1]](), 0 + ; CHECK-NEXT: G_STORE [[INSERT_SUBVECTOR]](), [[COPY3]](p0) :: (store () into %ir.p) + ; CHECK-NEXT: $w0 = COPY [[C]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 +entry: + %vector = call @llvm.vector.insert.nxv4i32.v4i32( %a, %b, i64 0) + store %vector, ptr %p, align 16 + ret i32 1 +} + +define i32 @extract_v4i32_vector_extract_const_illegal_fixed(<4 x i32> %a, ptr %p) { + ; CHECK-LABEL: name: extract_v4i32_vector_extract_const_illegal_fixed + ; CHECK: bb.1.entry: + ; CHECK-NEXT: liveins: $q0, $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C]](s64) + ; CHECK-NEXT: G_STORE [[EVEC]](s32), [[COPY1]](p0) :: (store (s32) into %ir.p, align 16) + ; CHECK-NEXT: $w0 = COPY [[C1]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 +entry: + %vector = call <1 x i32> @llvm.vector.extract(<4 x i32> %a, i64 0) + store <1 x i32> %vector, ptr %p, align 16 + ret i32 1 +} + +define i32 @extract_v4i32_vector_extract_const_illegal_scalable( %a, ptr %p) { + ; CHECK-LABEL: name: extract_v4i32_vector_extract_const_illegal_scalable + ; CHECK: bb.1.entry: + ; CHECK-NEXT: liveins: $x0, $z0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_() = COPY $z0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: 
[[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[VSCALE:%[0-9]+]]:_(s64) = G_VSCALE i64 1 + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[VSCALE]], [[C]] + ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](), [[MUL]](s64) + ; CHECK-NEXT: G_STORE [[EVEC]](s32), [[COPY1]](p0) :: (store (s32) into %ir.p, align 16) + ; CHECK-NEXT: $w0 = COPY [[C1]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 +entry: + %vector = call <1 x i32> @llvm.vector.extract( %a, i64 0) + store <1 x i32> %vector, ptr %p, align 16 + ret i32 1 +} + +define i32 @extract_v4i32_vector_insert_const_illegal_scalable( %a, <1 x i32> %b, i32 %c, ptr %p) { + ; CHECK-LABEL: name: extract_v4i32_vector_insert_const_illegal_scalable + ; CHECK: bb.1.entry: + ; CHECK-NEXT: liveins: $d1, $w0, $x1, $z0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_() = COPY $z0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $d1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(p0) = COPY $x1 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[VSCALE:%[0-9]+]]:_(s64) = G_VSCALE i64 1 + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[VSCALE]], [[C]] + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_() = G_INSERT_VECTOR_ELT [[COPY]], [[UV]](s32), [[MUL]](s64) + ; CHECK-NEXT: G_STORE [[IVEC]](), [[COPY3]](p0) :: (store () into %ir.p) + ; CHECK-NEXT: $w0 = COPY [[C1]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 +entry: + %vector = call @llvm.vector.insert.nxv4i32.v4i32( %a, <1 x i32> %b, i64 0) + store %vector, ptr %p, align 16 + ret i32 1 +} + +define i32 @extract_v4i32_vector_insert_const_fixed(<4 x i32> %a, <1 x i32> %b, i32 %c, ptr %p) { + ; CHECK-LABEL: name: extract_v4i32_vector_insert_const_fixed + ; CHECK: bb.1.entry: + ; 
CHECK-NEXT: liveins: $d1, $q0, $w0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $d1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(p0) = COPY $x1 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[UV]](s32), [[C]](s64) + ; CHECK-NEXT: G_STORE [[IVEC]](<4 x s32>), [[COPY3]](p0) :: (store (<4 x s32>) into %ir.p) + ; CHECK-NEXT: $w0 = COPY [[C1]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 +entry: + %vector = call <4 x i32> @llvm.vector.insert.v4i32.v4i32(<4 x i32> %a, <1 x i32> %b, i64 0) + store <4 x i32> %vector, ptr %p, align 16 + ret i32 1 +} + +define i32 @extract_v4i32_vector_insert_const_fixed_illegal(<1 x i32> %a, <1 x i32> %b, i32 %c, ptr %p) { + ; CHECK-LABEL: name: extract_v4i32_vector_insert_const_fixed_illegal + ; CHECK: bb.1.entry: + ; CHECK-NEXT: liveins: $d0, $d1, $w0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $d1 + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(p0) = COPY $x1 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; CHECK-NEXT: G_STORE [[COPY4]](s32), [[COPY3]](p0) :: (store (s32) into %ir.p, align 16) + ; CHECK-NEXT: $w0 = COPY [[C]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 +entry: + %vector = call <1 x i32> @llvm.vector.insert.v1i32.v4i32(<1 x i32> %a, <1 x 
i32> %b, i64 0) + store <1 x i32> %vector, ptr %p, align 16 + ret i32 1 +} + +define i32 @extract_v4ptr_vector_insert_const_fixed_illegal(<1 x ptr> %a, <1 x ptr> %b, i32 %c, ptr %p) { + ; CHECK-LABEL: name: extract_v4ptr_vector_insert_const_fixed_illegal + ; CHECK: bb.1.entry: + ; CHECK-NEXT: liveins: $d0, $d1, $w0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $d0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $d1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(p0) = COPY $x1 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(p0) = COPY [[COPY]](p0) + ; CHECK-NEXT: G_STORE [[COPY4]](p0), [[COPY3]](p0) :: (store (p0) into %ir.p, align 16) + ; CHECK-NEXT: $w0 = COPY [[C]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 +entry: + %vector = call <1 x ptr> @llvm.vector.insert.v1ptr.v4ptr(<1 x ptr> %a, <1 x ptr> %b, i64 0) + store <1 x ptr> %vector, ptr %p, align 16 + ret i32 1 +} + +define i32 @extract_v4ifloat_vector_insert_const_fixed_illegal(<1 x float> %a, <1 x float> %b, float %c, ptr %p) { + ; CHECK-LABEL: name: extract_v4ifloat_vector_insert_const_fixed_illegal + ; CHECK: bb.1.entry: + ; CHECK-NEXT: liveins: $d0, $d1, $s2, $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $d1 + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $s2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; CHECK-NEXT: G_STORE [[COPY4]](s32), [[COPY3]](p0) :: (store (s32) into %ir.p, align 16) + ; CHECK-NEXT: $w0 = COPY [[C]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 +entry: + 
%vector = call <1 x float> @llvm.vector.insert.v1f32.v4f32(<1 x float> %a, <1 x float> %b, i64 0) + store <1 x float> %vector, ptr %p, align 16 + ret i32 1 +} + +define i32 @extract_v4iptr_vector_insert_const_fixed_illegal(<1 x ptr> %a, <1 x ptr> %b, ptr %c, ptr %p) { + ; CHECK-LABEL: name: extract_v4iptr_vector_insert_const_fixed_illegal + ; CHECK: bb.1.entry: + ; CHECK-NEXT: liveins: $d0, $d1, $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $d0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $d1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(p0) = COPY $x1 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(p0) = COPY [[COPY]](p0) + ; CHECK-NEXT: G_STORE [[COPY4]](p0), [[COPY3]](p0) :: (store (p0) into %ir.p, align 16) + ; CHECK-NEXT: $w0 = COPY [[C]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 +entry: + %vector = call <1 x ptr> @llvm.vector.insert.v1ptr.v4ptr(<1 x ptr> %a, <1 x ptr> %b, i64 0) + store <1 x ptr> %vector, ptr %p, align 16 + ret i32 1 +} + +define i32 @extract_v4iptr_vector_insert_const_fixed_legal(<4 x ptr> %a, <4 x ptr> %b, ptr %c, ptr %p) { + ; CHECK-LABEL: name: extract_v4iptr_vector_insert_const_fixed_legal + ; CHECK: bb.1.entry: + ; CHECK-NEXT: liveins: $q0, $q1, $q2, $q3, $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x p0>) = G_CONCAT_VECTORS [[COPY]](<2 x s64>), [[COPY1]](<2 x s64>) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s64>) = COPY $q3 + ; CHECK-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x p0>) = G_CONCAT_VECTORS [[COPY2]](<2 x s64>), [[COPY3]](<2 x s64>) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(p0) = COPY $x1 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: 
[[INSERT_SUBVECTOR:%[0-9]+]]:_(<4 x p0>) = G_INSERT_SUBVECTOR [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]](<4 x p0>), 0 + ; CHECK-NEXT: G_STORE [[INSERT_SUBVECTOR]](<4 x p0>), [[COPY5]](p0) :: (store (<4 x p0>) into %ir.p, align 16) + ; CHECK-NEXT: $w0 = COPY [[C]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 +entry: + %vector = call <4 x ptr> @llvm.vector.insert.v4ptr.v4ptr(<4 x ptr> %a, <4 x ptr> %b, i64 0) + store <4 x ptr> %vector, ptr %p, align 16 + ret i32 1 +} From 6c5277baf558c0f3f17043b1adbed54679191779 Mon Sep 17 00:00:00 2001 From: David Majnemer Date: Mon, 30 Sep 2024 20:26:54 +0000 Subject: [PATCH 023/151] [X86] Decode VPTERNLOG truth tables when disassembling Alongside something like: vpternlogq zmm0, zmm2, zmm1, 64 We will now have a comment on the right like: # zmm0 = zmm0 & zmm2 & ~zmm1 This makes it easy to tell at a glance what sort of truth table the instruction will provide. --- .../X86/MCTargetDesc/X86InstComments.cpp | 92 +++ .../CodeGen/X86/avx512-gfni-intrinsics.ll | 74 ++ .../CodeGen/X86/avx512-intrinsics-upgrade.ll | 10 + llvm/test/CodeGen/X86/avx512-vec-cmp.ll | 7 + .../X86/avx512vl-intrinsics-upgrade.ll | 20 + llvm/test/CodeGen/X86/avx512vl-intrinsics.ll | 20 + .../CodeGen/X86/sse-intrinsics-fast-isel.ll | 1 + .../CodeGen/X86/sse2-intrinsics-fast-isel.ll | 2 + .../CodeGen/X86/stack-folding-int-avx512.ll | 18 +- .../vector-interleaved-load-i16-stride-5.ll | 92 +-- .../vector-interleaved-load-i16-stride-6.ll | 244 ++++--- .../vector-interleaved-load-i16-stride-7.ll | 158 ++-- .../vector-interleaved-load-i8-stride-6.ll | 492 ++++++------- .../vector-interleaved-store-i16-stride-5.ll | 308 ++++---- .../vector-interleaved-store-i16-stride-6.ll | 123 ++-- .../vector-interleaved-store-i16-stride-7.ll | 672 +++++++++--------- .../vector-interleaved-store-i8-stride-8.ll | 58 +- 17 files changed, 1370 insertions(+), 1021 deletions(-) diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp 
b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp index a4b72515252a0..534717a4bea4e 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp @@ -39,6 +39,11 @@ using namespace llvm; CASE_MASK_INS_COMMON(Inst, Suffix, src) \ CASE_MASKZ_INS_COMMON(Inst, Suffix, src) +#define CASE_PTERNLOG(Inst, src) \ + CASE_AVX512_INS_COMMON(Inst, Z, r##src##i) \ + CASE_AVX512_INS_COMMON(Inst, Z256, r##src##i) \ + CASE_AVX512_INS_COMMON(Inst, Z128, r##src##i) + #define CASE_MOVDUP(Inst, src) \ CASE_AVX512_INS_COMMON(Inst, Z, r##src) \ CASE_AVX512_INS_COMMON(Inst, Z256, r##src) \ @@ -617,6 +622,90 @@ static bool printFMAComments(const MCInst *MI, raw_ostream &OS, return true; } +static bool printPTERNLOGComments(const MCInst *MI, raw_ostream &OS, + const MCInstrInfo &MCII) { + unsigned NumOperands = MI->getNumOperands(); + + int Src2Idx; + int Src3Idx; + switch (MI->getOpcode()) { + // dest, src1, src2, src3, tbl + // dest, src1, mask, src2, src3, tbl + CASE_PTERNLOG(PTERNLOGD, r) + CASE_PTERNLOG(PTERNLOGQ, r) + Src2Idx = NumOperands - 3; + Src3Idx = NumOperands - 2; + break; + + // dest, src1, src2, memory, tbl + // dest, src1, mask, src2, memory, tbl + CASE_PTERNLOG(PTERNLOGD, m) + CASE_PTERNLOG(PTERNLOGQ, m) + CASE_PTERNLOG(PTERNLOGD, mb) + CASE_PTERNLOG(PTERNLOGQ, mb) + Src2Idx = NumOperands - 7; + Src3Idx = -1; + break; + + default: + return false; + } + const char *DestName = getRegName(MI->getOperand(0).getReg()); + const char *Src1Name = getRegName(MI->getOperand(1).getReg()); + const char *Src2Name = getRegName(MI->getOperand(Src2Idx).getReg()); + const char *Src3Name = + Src3Idx != -1 ? 
getRegName(MI->getOperand(Src3Idx).getReg()) : "mem"; + uint8_t TruthTable = MI->getOperand(NumOperands - 1).getImm(); + + OS << DestName; + printMasking(OS, MI, MCII); + OS << " = "; + + constexpr unsigned kNumVariables = 3; + constexpr unsigned kNumTruthTableEntries = 1 << kNumVariables; + int NumMinterms = llvm::popcount(TruthTable); + if (NumMinterms == 0) { + OS << '0'; + } else if (NumMinterms == kNumTruthTableEntries) { + OS << "-1"; + } else { + while (TruthTable != 0) { + // Index of the lowest bit set. + unsigned I = llvm::countr_zero(TruthTable); + // Clear the lowest bit set. + TruthTable &= TruthTable - 1; + // Our index tells us which sources are and are not complemented. Note + // that the indexing goes left-to-right. + bool Src1 = I & 0b100; + bool Src2 = I & 0b010; + bool Src3 = I & 0b001; + + // Group in parenthesis to make the output more obvious but only if there + // are multiple terms. + if (NumMinterms > 1) + OS << '('; + + if (!Src1) + OS << '~'; + OS << Src1Name << " & "; + if (!Src2) + OS << '~'; + OS << Src2Name << " & "; + if (!Src3) + OS << '~'; + OS << Src3Name; + + if (NumMinterms > 1) + OS << ')'; + + // Output an OR if there is another term in the table. + if (TruthTable != 0) + OS << " | "; + } + } + OS << '\n'; + return true; +} //===----------------------------------------------------------------------===// // Top Level Entrypoint @@ -636,6 +725,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, if (printFMAComments(MI, OS, MCII)) return true; + if (printPTERNLOGComments(MI, OS, MCII)) + return true; + switch (MI->getOpcode()) { default: // Not an instruction for which we can decode comments. 
diff --git a/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll index bafa33ff9a1c8..432d27ac04eda 100644 --- a/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll @@ -33,9 +33,11 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineinvqb_128(<16 x i8> ; X86NOBW-NEXT: vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xe1,0x04] ; X86NOBW-NEXT: vgf2p8affineinvqb $5, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xc1,0x05] ; X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X86NOBW-NEXT: # zmm1 {%k1} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm1, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xcd] ; X86NOBW-NEXT: vpand %xmm4, %xmm5, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xd1,0xdb,0xcc] ; X86NOBW-NEXT: vpternlogq $184, %xmm0, %xmm5, %xmm2 # encoding: [0x62,0xf3,0xd5,0x08,0x25,0xd0,0xb8] +; X86NOBW-NEXT: # xmm2 = (~xmm2 & xmm5 & xmm0) | (xmm2 & ~xmm5 & ~xmm0) | (xmm2 & ~xmm5 & xmm0) | (xmm2 & xmm5 & xmm0) ; X86NOBW-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3] ; X86NOBW-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86NOBW-NEXT: retl # encoding: [0xc3] @@ -47,9 +49,11 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineinvqb_128(<16 x i8> ; X64NOBW-NEXT: vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xe1,0x04] ; X64NOBW-NEXT: vgf2p8affineinvqb $5, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xc1,0x05] ; X64NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X64NOBW-NEXT: # zmm1 {%k1} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm1, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xcd] ; X64NOBW-NEXT: vpand %xmm4, %xmm5, %xmm1 # EVEX TO VEX 
Compression encoding: [0xc5,0xd1,0xdb,0xcc] ; X64NOBW-NEXT: vpternlogq $184, %xmm0, %xmm5, %xmm2 # encoding: [0x62,0xf3,0xd5,0x08,0x25,0xd0,0xb8] +; X64NOBW-NEXT: # xmm2 = (~xmm2 & xmm5 & xmm0) | (xmm2 & ~xmm5 & ~xmm0) | (xmm2 & ~xmm5 & xmm0) | (xmm2 & xmm5 & xmm0) ; X64NOBW-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3] ; X64NOBW-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64NOBW-NEXT: retq # encoding: [0xc3] @@ -95,12 +99,15 @@ define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineinvqb_256(<32 x i8> ; X86NOBW-NEXT: vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xe1,0x04] ; X86NOBW-NEXT: vgf2p8affineinvqb $5, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xc1,0x05] ; X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X86NOBW-NEXT: # zmm1 {%k1} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] ; X86NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} # encoding: [0x62,0xf3,0x55,0xca,0x25,0xed,0xff] +; X86NOBW-NEXT: # zmm5 {%k2} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed] ; X86NOBW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm5 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xed,0x01] ; X86NOBW-NEXT: vpand %ymm4, %ymm5, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xd5,0xdb,0xcc] ; X86NOBW-NEXT: vpternlogq $184, %ymm0, %ymm5, %ymm2 # encoding: [0x62,0xf3,0xd5,0x28,0x25,0xd0,0xb8] +; X86NOBW-NEXT: # ymm2 = (~ymm2 & ymm5 & ymm0) | (ymm2 & ~ymm5 & ~ymm0) | (ymm2 & ~ymm5 & ymm0) | (ymm2 & ymm5 & ymm0) ; X86NOBW-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3] ; X86NOBW-NEXT: retl # encoding: [0xc3] ; @@ -113,12 +120,15 @@ define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineinvqb_256(<32 x i8> ; X64NOBW-NEXT: vgf2p8affineinvqb $4, 
%ymm1, %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xe1,0x04] ; X64NOBW-NEXT: vgf2p8affineinvqb $5, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xc1,0x05] ; X64NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X64NOBW-NEXT: # zmm1 {%k1} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] ; X64NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} # encoding: [0x62,0xf3,0x55,0xca,0x25,0xed,0xff] +; X64NOBW-NEXT: # zmm5 {%k2} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed] ; X64NOBW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm5 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xed,0x01] ; X64NOBW-NEXT: vpand %ymm4, %ymm5, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xd5,0xdb,0xcc] ; X64NOBW-NEXT: vpternlogq $184, %ymm0, %ymm5, %ymm2 # encoding: [0x62,0xf3,0xd5,0x28,0x25,0xd0,0xb8] +; X64NOBW-NEXT: # ymm2 = (~ymm2 & ymm5 & ymm0) | (ymm2 & ~ymm5 & ~ymm0) | (ymm2 & ~ymm5 & ymm0) | (ymm2 & ymm5 & ymm0) ; X64NOBW-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3] ; X64NOBW-NEXT: retq # encoding: [0xc3] %1 = bitcast i32 %mask to <32 x i1> @@ -166,18 +176,23 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineinvqb_512(<64 x i8> ; X86NOBW-NEXT: vgf2p8affineinvqb $4, %zmm1, %zmm3, %zmm4 # encoding: [0x62,0xf3,0xe5,0x48,0xcf,0xe1,0x04] ; X86NOBW-NEXT: vgf2p8affineinvqb $5, %zmm1, %zmm3, %zmm3 # encoding: [0x62,0xf3,0xe5,0x48,0xcf,0xd9,0x05] ; X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} # encoding: [0x62,0xf3,0x75,0xcc,0x25,0xc9,0xff] +; X86NOBW-NEXT: # zmm1 {%k4} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] ; X86NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k3} {z} # encoding: [0x62,0xf3,0x55,0xcb,0x25,0xed,0xff] +; X86NOBW-NEXT: # zmm5 {%k3} {z} = -1 ; 
X86NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed] ; X86NOBW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xcd,0x01] ; X86NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} # encoding: [0x62,0xf3,0x55,0xca,0x25,0xed,0xff] +; X86NOBW-NEXT: # zmm5 {%k2} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed] ; X86NOBW-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} # encoding: [0x62,0xf3,0x4d,0xc9,0x25,0xf6,0xff] +; X86NOBW-NEXT: # zmm6 {%k1} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm6, %xmm6 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xf6] ; X86NOBW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x55,0x38,0xee,0x01] ; X86NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm5 # encoding: [0x62,0xf3,0xd5,0x48,0x3a,0xe9,0x01] ; X86NOBW-NEXT: vpandq %zmm4, %zmm5, %zmm1 # encoding: [0x62,0xf1,0xd5,0x48,0xdb,0xcc] ; X86NOBW-NEXT: vpternlogq $184, %zmm3, %zmm5, %zmm2 # encoding: [0x62,0xf3,0xd5,0x48,0x25,0xd3,0xb8] +; X86NOBW-NEXT: # zmm2 = (~zmm2 & zmm5 & zmm3) | (zmm2 & ~zmm5 & ~zmm3) | (zmm2 & ~zmm5 & zmm3) | (zmm2 & zmm5 & zmm3) ; X86NOBW-NEXT: retl # encoding: [0xc3] ; ; X64NOBW-LABEL: test_vgf2p8affineinvqb_512: @@ -195,18 +210,23 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineinvqb_512(<64 x i8> ; X64NOBW-NEXT: vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm4 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xe1,0x04] ; X64NOBW-NEXT: vgf2p8affineinvqb $5, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xc1,0x05] ; X64NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} # encoding: [0x62,0xf3,0x75,0xcc,0x25,0xc9,0xff] +; X64NOBW-NEXT: # zmm1 {%k4} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] ; X64NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k3} {z} # encoding: [0x62,0xf3,0x55,0xcb,0x25,0xed,0xff] +; X64NOBW-NEXT: # zmm5 {%k3} {z} = -1 ; X64NOBW-NEXT: 
vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed] ; X64NOBW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xcd,0x01] ; X64NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} # encoding: [0x62,0xf3,0x55,0xc9,0x25,0xed,0xff] +; X64NOBW-NEXT: # zmm5 {%k1} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed] ; X64NOBW-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z} # encoding: [0x62,0xf3,0x4d,0xca,0x25,0xf6,0xff] +; X64NOBW-NEXT: # zmm6 {%k2} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm6, %xmm6 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xf6] ; X64NOBW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x55,0x38,0xee,0x01] ; X64NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm5 # encoding: [0x62,0xf3,0xd5,0x48,0x3a,0xe9,0x01] ; X64NOBW-NEXT: vpandq %zmm4, %zmm5, %zmm1 # encoding: [0x62,0xf1,0xd5,0x48,0xdb,0xcc] ; X64NOBW-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm2 # encoding: [0x62,0xf3,0xd5,0x48,0x25,0xd0,0xb8] +; X64NOBW-NEXT: # zmm2 = (~zmm2 & zmm5 & zmm0) | (zmm2 & ~zmm5 & ~zmm0) | (zmm2 & ~zmm5 & zmm0) | (zmm2 & zmm5 & zmm0) ; X64NOBW-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3] ; X64NOBW-NEXT: retq # encoding: [0xc3] %1 = bitcast i64 %mask to <64 x i1> @@ -250,9 +270,11 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineqb_128(<16 x i8> %s ; X86NOBW-NEXT: vgf2p8affineqb $4, %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xe1,0x04] ; X86NOBW-NEXT: vgf2p8affineqb $5, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xc1,0x05] ; X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X86NOBW-NEXT: # zmm1 {%k1} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm1, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xcd] ; X86NOBW-NEXT: vpand %xmm4, %xmm5, %xmm1 # EVEX TO VEX Compression encoding: 
[0xc5,0xd1,0xdb,0xcc] ; X86NOBW-NEXT: vpternlogq $184, %xmm0, %xmm5, %xmm2 # encoding: [0x62,0xf3,0xd5,0x08,0x25,0xd0,0xb8] +; X86NOBW-NEXT: # xmm2 = (~xmm2 & xmm5 & xmm0) | (xmm2 & ~xmm5 & ~xmm0) | (xmm2 & ~xmm5 & xmm0) | (xmm2 & xmm5 & xmm0) ; X86NOBW-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3] ; X86NOBW-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86NOBW-NEXT: retl # encoding: [0xc3] @@ -264,9 +286,11 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineqb_128(<16 x i8> %s ; X64NOBW-NEXT: vgf2p8affineqb $4, %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xe1,0x04] ; X64NOBW-NEXT: vgf2p8affineqb $5, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xc1,0x05] ; X64NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X64NOBW-NEXT: # zmm1 {%k1} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm1, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xcd] ; X64NOBW-NEXT: vpand %xmm4, %xmm5, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xd1,0xdb,0xcc] ; X64NOBW-NEXT: vpternlogq $184, %xmm0, %xmm5, %xmm2 # encoding: [0x62,0xf3,0xd5,0x08,0x25,0xd0,0xb8] +; X64NOBW-NEXT: # xmm2 = (~xmm2 & xmm5 & xmm0) | (xmm2 & ~xmm5 & ~xmm0) | (xmm2 & ~xmm5 & xmm0) | (xmm2 & xmm5 & xmm0) ; X64NOBW-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3] ; X64NOBW-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64NOBW-NEXT: retq # encoding: [0xc3] @@ -312,12 +336,15 @@ define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineqb_256(<32 x i8> %s ; X86NOBW-NEXT: vgf2p8affineqb $4, %ymm1, %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xe1,0x04] ; X86NOBW-NEXT: vgf2p8affineqb $5, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xc1,0x05] ; X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; 
X86NOBW-NEXT: # zmm1 {%k1} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] ; X86NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} # encoding: [0x62,0xf3,0x55,0xca,0x25,0xed,0xff] +; X86NOBW-NEXT: # zmm5 {%k2} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed] ; X86NOBW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm5 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xed,0x01] ; X86NOBW-NEXT: vpand %ymm4, %ymm5, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xd5,0xdb,0xcc] ; X86NOBW-NEXT: vpternlogq $184, %ymm0, %ymm5, %ymm2 # encoding: [0x62,0xf3,0xd5,0x28,0x25,0xd0,0xb8] +; X86NOBW-NEXT: # ymm2 = (~ymm2 & ymm5 & ymm0) | (ymm2 & ~ymm5 & ~ymm0) | (ymm2 & ~ymm5 & ymm0) | (ymm2 & ymm5 & ymm0) ; X86NOBW-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3] ; X86NOBW-NEXT: retl # encoding: [0xc3] ; @@ -330,12 +357,15 @@ define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineqb_256(<32 x i8> %s ; X64NOBW-NEXT: vgf2p8affineqb $4, %ymm1, %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xe1,0x04] ; X64NOBW-NEXT: vgf2p8affineqb $5, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xc1,0x05] ; X64NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X64NOBW-NEXT: # zmm1 {%k1} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] ; X64NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} # encoding: [0x62,0xf3,0x55,0xca,0x25,0xed,0xff] +; X64NOBW-NEXT: # zmm5 {%k2} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed] ; X64NOBW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm5 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xed,0x01] ; X64NOBW-NEXT: vpand %ymm4, %ymm5, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xd5,0xdb,0xcc] ; X64NOBW-NEXT: vpternlogq $184, 
%ymm0, %ymm5, %ymm2 # encoding: [0x62,0xf3,0xd5,0x28,0x25,0xd0,0xb8] +; X64NOBW-NEXT: # ymm2 = (~ymm2 & ymm5 & ymm0) | (ymm2 & ~ymm5 & ~ymm0) | (ymm2 & ~ymm5 & ymm0) | (ymm2 & ymm5 & ymm0) ; X64NOBW-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3] ; X64NOBW-NEXT: retq # encoding: [0xc3] %1 = bitcast i32 %mask to <32 x i1> @@ -383,18 +413,23 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineqb_512(<64 x i8> %s ; X86NOBW-NEXT: vgf2p8affineqb $4, %zmm1, %zmm3, %zmm4 # encoding: [0x62,0xf3,0xe5,0x48,0xce,0xe1,0x04] ; X86NOBW-NEXT: vgf2p8affineqb $5, %zmm1, %zmm3, %zmm3 # encoding: [0x62,0xf3,0xe5,0x48,0xce,0xd9,0x05] ; X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} # encoding: [0x62,0xf3,0x75,0xcc,0x25,0xc9,0xff] +; X86NOBW-NEXT: # zmm1 {%k4} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] ; X86NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k3} {z} # encoding: [0x62,0xf3,0x55,0xcb,0x25,0xed,0xff] +; X86NOBW-NEXT: # zmm5 {%k3} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed] ; X86NOBW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xcd,0x01] ; X86NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} # encoding: [0x62,0xf3,0x55,0xca,0x25,0xed,0xff] +; X86NOBW-NEXT: # zmm5 {%k2} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed] ; X86NOBW-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} # encoding: [0x62,0xf3,0x4d,0xc9,0x25,0xf6,0xff] +; X86NOBW-NEXT: # zmm6 {%k1} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm6, %xmm6 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xf6] ; X86NOBW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x55,0x38,0xee,0x01] ; X86NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm5 # encoding: [0x62,0xf3,0xd5,0x48,0x3a,0xe9,0x01] ; X86NOBW-NEXT: vpandq %zmm4, %zmm5, %zmm1 # encoding: 
[0x62,0xf1,0xd5,0x48,0xdb,0xcc] ; X86NOBW-NEXT: vpternlogq $184, %zmm3, %zmm5, %zmm2 # encoding: [0x62,0xf3,0xd5,0x48,0x25,0xd3,0xb8] +; X86NOBW-NEXT: # zmm2 = (~zmm2 & zmm5 & zmm3) | (zmm2 & ~zmm5 & ~zmm3) | (zmm2 & ~zmm5 & zmm3) | (zmm2 & zmm5 & zmm3) ; X86NOBW-NEXT: retl # encoding: [0xc3] ; ; X64NOBW-LABEL: test_vgf2p8affineqb_512: @@ -412,18 +447,23 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineqb_512(<64 x i8> %s ; X64NOBW-NEXT: vgf2p8affineqb $4, %zmm1, %zmm0, %zmm4 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xe1,0x04] ; X64NOBW-NEXT: vgf2p8affineqb $5, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xc1,0x05] ; X64NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} # encoding: [0x62,0xf3,0x75,0xcc,0x25,0xc9,0xff] +; X64NOBW-NEXT: # zmm1 {%k4} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] ; X64NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k3} {z} # encoding: [0x62,0xf3,0x55,0xcb,0x25,0xed,0xff] +; X64NOBW-NEXT: # zmm5 {%k3} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed] ; X64NOBW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xcd,0x01] ; X64NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} # encoding: [0x62,0xf3,0x55,0xc9,0x25,0xed,0xff] +; X64NOBW-NEXT: # zmm5 {%k1} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed] ; X64NOBW-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z} # encoding: [0x62,0xf3,0x4d,0xca,0x25,0xf6,0xff] +; X64NOBW-NEXT: # zmm6 {%k2} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm6, %xmm6 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xf6] ; X64NOBW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x55,0x38,0xee,0x01] ; X64NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm5 # encoding: [0x62,0xf3,0xd5,0x48,0x3a,0xe9,0x01] ; X64NOBW-NEXT: vpandq %zmm4, %zmm5, %zmm1 # encoding: 
[0x62,0xf1,0xd5,0x48,0xdb,0xcc] ; X64NOBW-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm2 # encoding: [0x62,0xf3,0xd5,0x48,0x25,0xd0,0xb8] +; X64NOBW-NEXT: # zmm2 = (~zmm2 & zmm5 & zmm0) | (zmm2 & ~zmm5 & ~zmm0) | (zmm2 & ~zmm5 & zmm0) | (zmm2 & zmm5 & zmm0) ; X64NOBW-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3] ; X64NOBW-NEXT: retq # encoding: [0xc3] %1 = bitcast i64 %mask to <64 x i1> @@ -468,8 +508,10 @@ define <16 x i8> @test_vgf2p8mulb_128_mask(<16 x i8> %src1, <16 x i8> %src2, <16 ; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86NOBW-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xcf,0xc9] ; X86NOBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x25,0xc0,0xff] +; X86NOBW-NEXT: # zmm0 {%k1} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc0] ; X86NOBW-NEXT: vpternlogq $202, %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf3,0xf5,0x08,0x25,0xc2,0xca] +; X86NOBW-NEXT: # xmm0 = (~xmm0 & ~xmm1 & xmm2) | (~xmm0 & xmm1 & xmm2) | (xmm0 & xmm1 & ~xmm2) | (xmm0 & xmm1 & xmm2) ; X86NOBW-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86NOBW-NEXT: retl # encoding: [0xc3] ; @@ -478,8 +520,10 @@ define <16 x i8> @test_vgf2p8mulb_128_mask(<16 x i8> %src1, <16 x i8> %src2, <16 ; X64NOBW-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64NOBW-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xcf,0xc9] ; X64NOBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x25,0xc0,0xff] +; X64NOBW-NEXT: # zmm0 {%k1} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc0] ; X64NOBW-NEXT: vpternlogq $202, %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf3,0xf5,0x08,0x25,0xc2,0xca] +; X64NOBW-NEXT: # xmm0 = (~xmm0 & ~xmm1 & xmm2) | (~xmm0 & xmm1 & xmm2) | (xmm0 & xmm1 & ~xmm2) | (xmm0 & xmm1 & xmm2) ; 
X64NOBW-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64NOBW-NEXT: retq # encoding: [0xc3] %1 = bitcast i16 %mask to <16 x i1> @@ -506,6 +550,7 @@ define <16 x i8> @test_vgf2p8mulb_128_maskz(<16 x i8> %src1, <16 x i8> %src2, i1 ; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86NOBW-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xcf,0xc1] ; X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X86NOBW-NEXT: # zmm1 {%k1} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] ; X86NOBW-NEXT: vpand %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xdb,0xc0] ; X86NOBW-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] @@ -516,6 +561,7 @@ define <16 x i8> @test_vgf2p8mulb_128_maskz(<16 x i8> %src1, <16 x i8> %src2, i1 ; X64NOBW-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64NOBW-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xcf,0xc1] ; X64NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X64NOBW-NEXT: # zmm1 {%k1} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] ; X64NOBW-NEXT: vpand %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xdb,0xc0] ; X64NOBW-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] @@ -557,11 +603,14 @@ define <32 x i8> @test_vgf2p8mulb_256_mask(<32 x i8> %src1, <32 x i8> %src2, <32 ; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k2 # encoding: [0xc5,0xf8,0x90,0x54,0x24,0x06] ; X86NOBW-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xcf,0xc9] ; X86NOBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x25,0xc0,0xff] +; X86NOBW-NEXT: # zmm0 {%k1} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm0, %xmm0 # encoding: 
[0x62,0xf2,0x7e,0x48,0x31,0xc0] ; X86NOBW-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} # encoding: [0x62,0xf3,0x65,0xca,0x25,0xdb,0xff] +; X86NOBW-NEXT: # zmm3 {%k2} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm3, %xmm3 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xdb] ; X86NOBW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc3,0x01] ; X86NOBW-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf3,0xf5,0x28,0x25,0xc2,0xca] +; X86NOBW-NEXT: # ymm0 = (~ymm0 & ~ymm1 & ymm2) | (~ymm0 & ymm1 & ymm2) | (ymm0 & ymm1 & ~ymm2) | (ymm0 & ymm1 & ymm2) ; X86NOBW-NEXT: retl # encoding: [0xc3] ; ; X64NOBW-LABEL: test_vgf2p8mulb_256_mask: @@ -571,11 +620,14 @@ define <32 x i8> @test_vgf2p8mulb_256_mask(<32 x i8> %src1, <32 x i8> %src2, <32 ; X64NOBW-NEXT: kmovw %edi, %k2 # encoding: [0xc5,0xf8,0x92,0xd7] ; X64NOBW-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xcf,0xc9] ; X64NOBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x25,0xc0,0xff] +; X64NOBW-NEXT: # zmm0 {%k1} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc0] ; X64NOBW-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} # encoding: [0x62,0xf3,0x65,0xca,0x25,0xdb,0xff] +; X64NOBW-NEXT: # zmm3 {%k2} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm3, %xmm3 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xdb] ; X64NOBW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc3,0x01] ; X64NOBW-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf3,0xf5,0x28,0x25,0xc2,0xca] +; X64NOBW-NEXT: # ymm0 = (~ymm0 & ~ymm1 & ymm2) | (~ymm0 & ymm1 & ymm2) | (ymm0 & ymm1 & ~ymm2) | (ymm0 & ymm1 & ymm2) ; X64NOBW-NEXT: retq # encoding: [0xc3] %1 = bitcast i32 %mask to <32 x i1> %2 = call <32 x i8> @llvm.x86.vgf2p8mulb.256(<32 x i8> %src1, <32 x i8> %src2) @@ -602,8 +654,10 @@ define <32 x i8> @test_vgf2p8mulb_256_maskz(<32 
x i8> %src1, <32 x i8> %src2, i3 ; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k2 # encoding: [0xc5,0xf8,0x90,0x54,0x24,0x06] ; X86NOBW-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xcf,0xc1] ; X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X86NOBW-NEXT: # zmm1 {%k1} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] ; X86NOBW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} # encoding: [0x62,0xf3,0x6d,0xca,0x25,0xd2,0xff] +; X86NOBW-NEXT: # zmm2 {%k2} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm2, %xmm2 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xd2] ; X86NOBW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xca,0x01] ; X86NOBW-NEXT: vpand %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xdb,0xc0] @@ -616,8 +670,10 @@ define <32 x i8> @test_vgf2p8mulb_256_maskz(<32 x i8> %src1, <32 x i8> %src2, i3 ; X64NOBW-NEXT: kmovw %edi, %k2 # encoding: [0xc5,0xf8,0x92,0xd7] ; X64NOBW-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xcf,0xc1] ; X64NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X64NOBW-NEXT: # zmm1 {%k1} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] ; X64NOBW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} # encoding: [0x62,0xf3,0x6d,0xca,0x25,0xd2,0xff] +; X64NOBW-NEXT: # zmm2 {%k2} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm2, %xmm2 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xd2] ; X64NOBW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xca,0x01] ; X64NOBW-NEXT: vpand %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xdb,0xc0] @@ -661,17 +717,22 @@ define <64 x i8> @test_vgf2p8mulb_512_mask(<64 x i8> %src1, <64 x i8> %src2, <64 ; X86NOBW-NEXT: 
kmovw {{[0-9]+}}(%esp), %k4 # encoding: [0xc5,0xf8,0x90,0x64,0x24,0x08] ; X86NOBW-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0xcf,0xc9] ; X86NOBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} # encoding: [0x62,0xf3,0x7d,0xcc,0x25,0xc0,0xff] +; X86NOBW-NEXT: # zmm0 {%k4} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc0] ; X86NOBW-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z} # encoding: [0x62,0xf3,0x65,0xcb,0x25,0xdb,0xff] +; X86NOBW-NEXT: # zmm3 {%k3} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm3, %xmm3 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xdb] ; X86NOBW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc3,0x01] ; X86NOBW-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0x65,0xc9,0x25,0xdb,0xff] +; X86NOBW-NEXT: # zmm3 {%k1} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm3, %xmm3 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xdb] ; X86NOBW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z} # encoding: [0x62,0xf3,0x5d,0xca,0x25,0xe4,0xff] +; X86NOBW-NEXT: # zmm4 {%k2} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm4, %xmm4 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xe4] ; X86NOBW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x65,0x38,0xdc,0x01] ; X86NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 # encoding: [0x62,0xf3,0xe5,0x48,0x3a,0xc0,0x01] ; X86NOBW-NEXT: vpternlogq $202, %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf3,0xf5,0x48,0x25,0xc2,0xca] +; X86NOBW-NEXT: # zmm0 = (~zmm0 & ~zmm1 & zmm2) | (~zmm0 & zmm1 & zmm2) | (zmm0 & zmm1 & ~zmm2) | (zmm0 & zmm1 & zmm2) ; X86NOBW-NEXT: retl # encoding: [0xc3] ; ; X64NOBW-LABEL: test_vgf2p8mulb_512_mask: @@ -687,17 +748,22 @@ define <64 x i8> @test_vgf2p8mulb_512_mask(<64 x i8> %src1, <64 x i8> %src2, <64 ; X64NOBW-NEXT: kmovw %edi, %k4 # encoding: [0xc5,0xf8,0x92,0xe7] ; X64NOBW-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0xcf,0xc9] 
; X64NOBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} # encoding: [0x62,0xf3,0x7d,0xcc,0x25,0xc0,0xff] +; X64NOBW-NEXT: # zmm0 {%k4} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc0] ; X64NOBW-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z} # encoding: [0x62,0xf3,0x65,0xcb,0x25,0xdb,0xff] +; X64NOBW-NEXT: # zmm3 {%k3} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm3, %xmm3 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xdb] ; X64NOBW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc3,0x01] ; X64NOBW-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0x65,0xc9,0x25,0xdb,0xff] +; X64NOBW-NEXT: # zmm3 {%k1} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm3, %xmm3 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xdb] ; X64NOBW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z} # encoding: [0x62,0xf3,0x5d,0xca,0x25,0xe4,0xff] +; X64NOBW-NEXT: # zmm4 {%k2} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm4, %xmm4 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xe4] ; X64NOBW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x65,0x38,0xdc,0x01] ; X64NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 # encoding: [0x62,0xf3,0xe5,0x48,0x3a,0xc0,0x01] ; X64NOBW-NEXT: vpternlogq $202, %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf3,0xf5,0x48,0x25,0xc2,0xca] +; X64NOBW-NEXT: # zmm0 = (~zmm0 & ~zmm1 & zmm2) | (~zmm0 & zmm1 & zmm2) | (zmm0 & zmm1 & ~zmm2) | (zmm0 & zmm1 & zmm2) ; X64NOBW-NEXT: retq # encoding: [0xc3] %1 = bitcast i64 %mask to <64 x i1> %2 = call <64 x i8> @llvm.x86.vgf2p8mulb.512(<64 x i8> %src1, <64 x i8> %src2) @@ -726,13 +792,17 @@ define <64 x i8> @test_vgf2p8mulb_512_maskz(<64 x i8> %src1, <64 x i8> %src2, i6 ; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k4 # encoding: [0xc5,0xf8,0x90,0x64,0x24,0x08] ; X86NOBW-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0xcf,0xc1] ; X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} # 
encoding: [0x62,0xf3,0x75,0xcc,0x25,0xc9,0xff] +; X86NOBW-NEXT: # zmm1 {%k4} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] ; X86NOBW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k3} {z} # encoding: [0x62,0xf3,0x6d,0xcb,0x25,0xd2,0xff] +; X86NOBW-NEXT: # zmm2 {%k3} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm2, %xmm2 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xd2] ; X86NOBW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xca,0x01] ; X86NOBW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} # encoding: [0x62,0xf3,0x6d,0xc9,0x25,0xd2,0xff] +; X86NOBW-NEXT: # zmm2 {%k1} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm2, %xmm2 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xd2] ; X86NOBW-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} # encoding: [0x62,0xf3,0x65,0xca,0x25,0xdb,0xff] +; X86NOBW-NEXT: # zmm3 {%k2} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm3, %xmm3 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xdb] ; X86NOBW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x6d,0x38,0xd3,0x01] ; X86NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 # encoding: [0x62,0xf3,0xed,0x48,0x3a,0xc9,0x01] @@ -752,13 +822,17 @@ define <64 x i8> @test_vgf2p8mulb_512_maskz(<64 x i8> %src1, <64 x i8> %src2, i6 ; X64NOBW-NEXT: kmovw %edi, %k4 # encoding: [0xc5,0xf8,0x92,0xe7] ; X64NOBW-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0xcf,0xc1] ; X64NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} # encoding: [0x62,0xf3,0x75,0xcc,0x25,0xc9,0xff] +; X64NOBW-NEXT: # zmm1 {%k4} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] ; X64NOBW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k3} {z} # encoding: [0x62,0xf3,0x6d,0xcb,0x25,0xd2,0xff] +; X64NOBW-NEXT: # zmm2 {%k3} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm2, %xmm2 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xd2] ; X64NOBW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 # EVEX TO VEX 
Compression encoding: [0xc4,0xe3,0x75,0x38,0xca,0x01] ; X64NOBW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} # encoding: [0x62,0xf3,0x6d,0xc9,0x25,0xd2,0xff] +; X64NOBW-NEXT: # zmm2 {%k1} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm2, %xmm2 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xd2] ; X64NOBW-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} # encoding: [0x62,0xf3,0x65,0xca,0x25,0xdb,0xff] +; X64NOBW-NEXT: # zmm3 {%k2} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm3, %xmm3 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xdb] ; X64NOBW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x6d,0x38,0xd3,0x01] ; X64NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 # encoding: [0x62,0xf3,0xed,0x48,0x3a,0xc9,0x01] diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index 6c9c28bc9e55e..9b94a9cf45ddf 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -7682,6 +7682,7 @@ define <16 x i32>@test_int_x86_avx512_pternlog_d_512(<16 x i32> %x0, <16 x i32> ; CHECK-LABEL: test_int_x86_avx512_pternlog_d_512: ; CHECK: ## %bb.0: ; CHECK-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf3,0x75,0x48,0x25,0xc2,0x21] +; CHECK-NEXT: ## zmm0 = (~zmm0 & ~zmm1 & ~zmm2) | (zmm0 & ~zmm1 & zmm2) ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1) ret <16 x i32> %res @@ -7692,12 +7693,14 @@ define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x ; X86: ## %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x49,0x25,0xc2,0x21] +; X86-NEXT: ## zmm0 {%k1} = (~zmm0 & ~zmm1 & ~zmm2) | (zmm0 & ~zmm1 & zmm2) ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: 
test_int_x86_avx512_mask_pternlog_d_512: ; X64: ## %bb.0: ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x49,0x25,0xc2,0x21] +; X64-NEXT: ## zmm0 {%k1} = (~zmm0 & ~zmm1 & ~zmm2) | (zmm0 & ~zmm1 & zmm2) ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4) ret <16 x i32> %res @@ -7710,12 +7713,14 @@ define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x ; X86: ## %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc2,0x21] +; X86-NEXT: ## zmm0 {%k1} {z} = (~zmm0 & ~zmm1 & ~zmm2) | (zmm0 & ~zmm1 & zmm2) ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_pternlog_d_512: ; X64: ## %bb.0: ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc2,0x21] +; X64-NEXT: ## zmm0 {%k1} {z} = (~zmm0 & ~zmm1 & ~zmm2) | (zmm0 & ~zmm1 & zmm2) ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4) ret <16 x i32> %res @@ -7727,6 +7732,7 @@ define <8 x i64>@test_int_x86_avx512_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1 ; CHECK-LABEL: test_int_x86_avx512_pternlog_q_512: ; CHECK: ## %bb.0: ; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf3,0xf5,0x48,0x25,0xc2,0x21] +; CHECK-NEXT: ## zmm0 = (~zmm0 & ~zmm1 & ~zmm2) | (zmm0 & ~zmm1 & zmm2) ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 -1) ret <8 x i64> %res @@ -7738,12 +7744,14 @@ define <8 x 
i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf3,0xf5,0x49,0x25,0xc2,0x21] +; X86-NEXT: ## zmm0 {%k1} = (~zmm0 & ~zmm1 & ~zmm2) | (zmm0 & ~zmm1 & zmm2) ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pternlog_q_512: ; X64: ## %bb.0: ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf3,0xf5,0x49,0x25,0xc2,0x21] +; X64-NEXT: ## zmm0 {%k1} = (~zmm0 & ~zmm1 & ~zmm2) | (zmm0 & ~zmm1 & zmm2) ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 %x4) ret <8 x i64> %res @@ -7757,12 +7765,14 @@ define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i6 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xc9,0x25,0xc2,0x21] +; X86-NEXT: ## zmm0 {%k1} {z} = (~zmm0 & ~zmm1 & ~zmm2) | (zmm0 & ~zmm1 & zmm2) ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_pternlog_q_512: ; X64: ## %bb.0: ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xc9,0x25,0xc2,0x21] +; X64-NEXT: ## zmm0 {%k1} {z} = (~zmm0 & ~zmm1 & ~zmm2) | (zmm0 & ~zmm1 & zmm2) ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 %x4) ret <8 x i64> %res diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll 
index 86ebb1e40870f..832e55a835525 100644 --- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll @@ -301,6 +301,7 @@ define <16 x i32> @test13(<16 x float>%a, <16 x float>%b) ; AVX512: ## %bb.0: ; AVX512-NEXT: vcmpeqps %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf1,0x7c,0x48,0xc2,0xc9,0x00] ; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x25,0xc0,0xff] +; AVX512-NEXT: ## zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vpsrld $31, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0x72,0xd0,0x1f] ; AVX512-NEXT: retq ## encoding: [0xc3] ; @@ -520,6 +521,7 @@ define <8 x i32>@test28(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y1 ; AVX512-NEXT: vpcmpgtq %zmm3, %zmm2, %k1 ## encoding: [0x62,0xf2,0xed,0x48,0x37,0xcb] ; AVX512-NEXT: kxnorw %k1, %k0, %k1 ## encoding: [0xc5,0xfc,0x46,0xc9] ; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x25,0xc0,0xff] +; AVX512-NEXT: ## zmm0 {%k1} {z} = -1 ; AVX512-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512-NEXT: retq ## encoding: [0xc3] ; @@ -544,6 +546,7 @@ define <16 x i8>@test29(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i32> ; KNL-NEXT: vpcmpgtd %zmm3, %zmm2, %k1 ## encoding: [0x62,0xf1,0x6d,0x48,0x66,0xcb] ; KNL-NEXT: kxorw %k1, %k0, %k1 ## encoding: [0xc5,0xfc,0x47,0xc9] ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x25,0xc0,0xff] +; KNL-NEXT: ## zmm0 {%k1} {z} = -1 ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc0] ; KNL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; KNL-NEXT: retq ## encoding: [0xc3] @@ -1233,6 +1236,7 @@ define <16 x i8> @test47(<16 x i32> %a, <16 x i8> %b, <16 x i8> %c) { ; KNL: ## %bb.0: ; KNL-NEXT: vptestnmd %zmm0, %zmm0, %k1 ## encoding: [0x62,0xf2,0x7e,0x48,0x27,0xc8] ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x25,0xc0,0xff] +; KNL-NEXT: 
## zmm0 {%k1} {z} = -1 ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc0] ; KNL-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x4c,0xc1,0x00] ; KNL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] @@ -1264,6 +1268,7 @@ define <16 x i16> @test48(<16 x i32> %a, <16 x i16> %b, <16 x i16> %c) { ; KNL: ## %bb.0: ; KNL-NEXT: vptestnmd %zmm0, %zmm0, %k1 ## encoding: [0x62,0xf2,0x7e,0x48,0x27,0xc8] ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x25,0xc0,0xff] +; KNL-NEXT: ## zmm0 {%k1} {z} = -1 ; KNL-NEXT: vpmovdw %zmm0, %ymm0 ## encoding: [0x62,0xf2,0x7e,0x48,0x33,0xc0] ; KNL-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 ## encoding: [0xc4,0xe3,0x6d,0x4c,0xc1,0x00] ; KNL-NEXT: retq ## encoding: [0xc3] @@ -1292,6 +1297,7 @@ define <8 x i16> @test49(<8 x i64> %a, <8 x i16> %b, <8 x i16> %c) { ; KNL: ## %bb.0: ; KNL-NEXT: vptestnmq %zmm0, %zmm0, %k1 ## encoding: [0x62,0xf2,0xfe,0x48,0x27,0xc8] ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x25,0xc0,0xff] +; KNL-NEXT: ## zmm0 {%k1} {z} = -1 ; KNL-NEXT: vpmovdw %zmm0, %ymm0 ## encoding: [0x62,0xf2,0x7e,0x48,0x33,0xc0] ; KNL-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x4c,0xc1,0x00] ; KNL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] @@ -1408,6 +1414,7 @@ define <4 x i32> @zext_bool_logic(<4 x i64> %cond1, <4 x i64> %cond2, <4 x i32> ; AVX512-NEXT: vptestnmq %zmm1, %zmm1, %k1 ## encoding: [0x62,0xf2,0xf6,0x48,0x27,0xc9] ; AVX512-NEXT: korw %k1, %k0, %k1 ## encoding: [0xc5,0xfc,0x45,0xc9] ; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x25,0xc0,0xff] +; AVX512-NEXT: ## zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vpsubd %xmm0, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xfa,0xc0] ; AVX512-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; AVX512-NEXT: retq ## encoding: [0xc3] diff --git 
a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index c0bb0037923dc..519f19740ab25 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -12346,6 +12346,7 @@ define <4 x i32>@test_int_x86_avx512_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1 ; CHECK-LABEL: test_int_x86_avx512_pternlog_d_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf3,0x75,0x08,0x25,0xc2,0x21] +; CHECK-NEXT: # xmm0 = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 -1) ret <4 x i32> %res @@ -12357,12 +12358,14 @@ define <4 x i32>@test_int_x86_avx512_mask_pternlog_d_128(<4 x i32> %x0, <4 x i32 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf3,0x75,0x09,0x25,0xc2,0x21] +; X86-NEXT: # xmm0 {%k1} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pternlog_d_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf3,0x75,0x09,0x25,0xc2,0x21] +; X64-NEXT: # xmm0 {%k1} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 %x4) ret <4 x i32> %res @@ -12376,12 +12379,14 @@ define <4 x i32>@test_int_x86_avx512_maskz_pternlog_d_128(<4 x i32> %x0, <4 x i3 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: 
[0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x75,0x89,0x25,0xc2,0x21] +; X86-NEXT: # xmm0 {%k1} {z} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_pternlog_d_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x75,0x89,0x25,0xc2,0x21] +; X64-NEXT: # xmm0 {%k1} {z} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.maskz.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 %x4) ret <4 x i32> %res @@ -12393,6 +12398,7 @@ define <8 x i32>@test_int_x86_avx512_pternlog_d_256(<8 x i32> %x0, <8 x i32> %x1 ; CHECK-LABEL: test_int_x86_avx512_pternlog_d_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf3,0x75,0x28,0x25,0xc2,0x21] +; CHECK-NEXT: # ymm0 = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 -1) ret <8 x i32> %res @@ -12404,12 +12410,14 @@ define <8 x i32>@test_int_x86_avx512_mask_pternlog_d_256(<8 x i32> %x0, <8 x i32 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf3,0x75,0x29,0x25,0xc2,0x21] +; X86-NEXT: # ymm0 {%k1} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pternlog_d_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf3,0x75,0x29,0x25,0xc2,0x21] +; X64-NEXT: # ymm0 {%k1} 
= (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 %x4) ret <8 x i32> %res @@ -12423,12 +12431,14 @@ define <8 x i32>@test_int_x86_avx512_maskz_pternlog_d_256(<8 x i32> %x0, <8 x i3 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xa9,0x25,0xc2,0x21] +; X86-NEXT: # ymm0 {%k1} {z} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_pternlog_d_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xa9,0x25,0xc2,0x21] +; X64-NEXT: # ymm0 {%k1} {z} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.maskz.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 %x4) ret <8 x i32> %res @@ -12440,6 +12450,7 @@ define <2 x i64>@test_int_x86_avx512_pternlog_q_128(<2 x i64> %x0, <2 x i64> %x1 ; CHECK-LABEL: test_int_x86_avx512_pternlog_q_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf3,0xf5,0x08,0x25,0xc2,0x21] +; CHECK-NEXT: # xmm0 = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 -1) ret <2 x i64> %res @@ -12451,12 +12462,14 @@ define <2 x i64>@test_int_x86_avx512_mask_pternlog_q_128(<2 x i64> %x0, <2 x i64 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogq 
$33, %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf3,0xf5,0x09,0x25,0xc2,0x21] +; X86-NEXT: # xmm0 {%k1} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pternlog_q_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf3,0xf5,0x09,0x25,0xc2,0x21] +; X64-NEXT: # xmm0 {%k1} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 %x4) ret <2 x i64> %res @@ -12470,12 +12483,14 @@ define <2 x i64>@test_int_x86_avx512_maskz_pternlog_q_128(<2 x i64> %x0, <2 x i6 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xf5,0x89,0x25,0xc2,0x21] +; X86-NEXT: # xmm0 {%k1} {z} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_pternlog_q_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xf5,0x89,0x25,0xc2,0x21] +; X64-NEXT: # xmm0 {%k1} {z} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.maskz.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 %x4) ret <2 x i64> %res @@ -12487,6 +12502,7 @@ define <4 x i64>@test_int_x86_avx512_pternlog_q_256(<4 x i64> %x0, <4 x i64> %x1 ; CHECK-LABEL: test_int_x86_avx512_pternlog_q_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf3,0xf5,0x28,0x25,0xc2,0x21] +; CHECK-NEXT: # ymm0 = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; 
CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 -1) ret <4 x i64> %res @@ -12498,12 +12514,14 @@ define <4 x i64>@test_int_x86_avx512_mask_pternlog_q_256(<4 x i64> %x0, <4 x i64 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf3,0xf5,0x29,0x25,0xc2,0x21] +; X86-NEXT: # ymm0 {%k1} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pternlog_q_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf3,0xf5,0x29,0x25,0xc2,0x21] +; X64-NEXT: # ymm0 {%k1} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 %x4) ret <4 x i64> %res @@ -12517,12 +12535,14 @@ define <4 x i64>@test_int_x86_avx512_maskz_pternlog_q_256(<4 x i64> %x0, <4 x i6 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xf5,0xa9,0x25,0xc2,0x21] +; X86-NEXT: # ymm0 {%k1} {z} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_pternlog_q_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xf5,0xa9,0x25,0xc2,0x21] +; X64-NEXT: # ymm0 {%k1} {z} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> 
@llvm.x86.avx512.maskz.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 %x4) ret <4 x i64> %res diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll index f1c70378b1eb3..6c7a5d2f86341 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -4150,6 +4150,7 @@ define <4 x i32>@test_int_x86_avx512_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1 ; CHECK-LABEL: test_int_x86_avx512_pternlog_d_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf3,0x75,0x08,0x25,0xc2,0x21] +; CHECK-NEXT: # xmm0 = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %1 = call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33) ret <4 x i32> %1 @@ -4161,12 +4162,14 @@ define <4 x i32>@test_int_x86_avx512_mask_pternlog_d_128(<4 x i32> %x0, <4 x i32 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf3,0x75,0x09,0x25,0xc2,0x21] +; X86-NEXT: # xmm0 {%k1} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pternlog_d_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf3,0x75,0x09,0x25,0xc2,0x21] +; X64-NEXT: # xmm0 {%k1} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33) %2 = bitcast i8 %x4 to <8 x i1> @@ -4183,12 +4186,14 @@ define <4 x i32>@test_int_x86_avx512_maskz_pternlog_d_128(<4 x i32> %x0, <4 x i3 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: 
[0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x75,0x89,0x25,0xc2,0x21] +; X86-NEXT: # xmm0 {%k1} {z} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_pternlog_d_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x75,0x89,0x25,0xc2,0x21] +; X64-NEXT: # xmm0 {%k1} {z} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33) %2 = bitcast i8 %x4 to <8 x i1> @@ -4203,6 +4208,7 @@ define <8 x i32>@test_int_x86_avx512_pternlog_d_256(<8 x i32> %x0, <8 x i32> %x1 ; CHECK-LABEL: test_int_x86_avx512_pternlog_d_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf3,0x75,0x28,0x25,0xc2,0x21] +; CHECK-NEXT: # ymm0 = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %1 = call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33) ret <8 x i32> %1 @@ -4214,12 +4220,14 @@ define <8 x i32>@test_int_x86_avx512_mask_pternlog_d_256(<8 x i32> %x0, <8 x i32 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf3,0x75,0x29,0x25,0xc2,0x21] +; X86-NEXT: # ymm0 {%k1} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pternlog_d_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0 {%k1} # encoding: 
[0x62,0xf3,0x75,0x29,0x25,0xc2,0x21] +; X64-NEXT: # ymm0 {%k1} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X64-NEXT: retq # encoding: [0xc3] %1 = call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33) %2 = bitcast i8 %x4 to <8 x i1> @@ -4235,12 +4243,14 @@ define <8 x i32>@test_int_x86_avx512_maskz_pternlog_d_256(<8 x i32> %x0, <8 x i3 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xa9,0x25,0xc2,0x21] +; X86-NEXT: # ymm0 {%k1} {z} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_pternlog_d_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xa9,0x25,0xc2,0x21] +; X64-NEXT: # ymm0 {%k1} {z} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X64-NEXT: retq # encoding: [0xc3] %1 = call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33) %2 = bitcast i8 %x4 to <8 x i1> @@ -4254,6 +4264,7 @@ define <2 x i64>@test_int_x86_avx512_pternlog_q_128(<2 x i64> %x0, <2 x i64> %x1 ; CHECK-LABEL: test_int_x86_avx512_pternlog_q_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf3,0xf5,0x08,0x25,0xc2,0x21] +; CHECK-NEXT: # xmm0 = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %1 = call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33) ret <2 x i64> %1 @@ -4265,12 +4276,14 @@ define <2 x i64>@test_int_x86_avx512_mask_pternlog_q_128(<2 x i64> %x0, <2 x i64 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: 
[0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf3,0xf5,0x09,0x25,0xc2,0x21] +; X86-NEXT: # xmm0 {%k1} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pternlog_q_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf3,0xf5,0x09,0x25,0xc2,0x21] +; X64-NEXT: # xmm0 {%k1} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X64-NEXT: retq # encoding: [0xc3] %1 = call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33) %2 = bitcast i8 %x4 to <8 x i1> @@ -4285,12 +4298,14 @@ define <2 x i64>@test_int_x86_avx512_maskz_pternlog_q_128(<2 x i64> %x0, <2 x i6 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xf5,0x89,0x25,0xc2,0x21] +; X86-NEXT: # xmm0 {%k1} {z} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_pternlog_q_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xf5,0x89,0x25,0xc2,0x21] +; X64-NEXT: # xmm0 {%k1} {z} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X64-NEXT: retq # encoding: [0xc3] %1 = call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33) %2 = bitcast i8 %x4 to <8 x i1> @@ -4305,6 +4320,7 @@ define <4 x i64>@test_int_x86_avx512_pternlog_q_256(<4 x i64> %x0, <4 x i64> %x1 ; CHECK-LABEL: test_int_x86_avx512_pternlog_q_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf3,0xf5,0x28,0x25,0xc2,0x21] +; CHECK-NEXT: # ymm0 = (~ymm0 & ~ymm1 
& ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %1 = call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33) ret <4 x i64> %1 @@ -4316,12 +4332,14 @@ define <4 x i64>@test_int_x86_avx512_mask_pternlog_q_256(<4 x i64> %x0, <4 x i64 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf3,0xf5,0x29,0x25,0xc2,0x21] +; X86-NEXT: # ymm0 {%k1} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pternlog_q_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf3,0xf5,0x29,0x25,0xc2,0x21] +; X64-NEXT: # ymm0 {%k1} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33) %2 = bitcast i8 %x4 to <8 x i1> @@ -4336,12 +4354,14 @@ define <4 x i64>@test_int_x86_avx512_maskz_pternlog_q_256(<4 x i64> %x0, <4 x i6 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xf5,0xa9,0x25,0xc2,0x21] +; X86-NEXT: # ymm0 {%k1} {z} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_pternlog_q_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xf5,0xa9,0x25,0xc2,0x21] +; X64-NEXT: # ymm0 {%k1} {z} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x i64> 
@llvm.x86.avx512.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33) %2 = bitcast i8 %x4 to <8 x i1> diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll index f2e48c7f308e5..86b8121f21cff 100644 --- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll @@ -96,6 +96,7 @@ define <4 x float> @test_mm_andnot_ps(<4 x float> %a0, <4 x float> %a1) nounwind ; AVX512-LABEL: test_mm_andnot_ps: ; AVX512: # %bb.0: ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x25,0xc0,0x0f] +; AVX512-NEXT: # xmm0 = (~xmm0 & ~xmm0 & ~xmm0) | (~xmm0 & ~xmm0 & xmm0) | (~xmm0 & xmm0 & ~xmm0) | (~xmm0 & xmm0 & xmm0) ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc1] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll index adf4fc28208e7..853bb6367fe4d 100644 --- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -288,6 +288,7 @@ define <2 x double> @test_mm_andnot_pd(<2 x double> %a0, <2 x double> %a1) nounw ; AVX512-LABEL: test_mm_andnot_pd: ; AVX512: # %bb.0: ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x25,0xc0,0x0f] +; AVX512-NEXT: # xmm0 = (~xmm0 & ~xmm0 & ~xmm0) | (~xmm0 & ~xmm0 & xmm0) | (~xmm0 & xmm0 & ~xmm0) | (~xmm0 & xmm0 & xmm0) ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc1] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <2 x double> %a0 to <4 x i32> @@ -316,6 +317,7 @@ define <2 x i64> @test_mm_andnot_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; AVX512-LABEL: test_mm_andnot_si128: ; AVX512: # %bb.0: ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 # encoding: 
[0x62,0xf3,0xfd,0x08,0x25,0xc0,0x0f] +; AVX512-NEXT: # xmm0 = (~xmm0 & ~xmm0 & ~xmm0) | (~xmm0 & ~xmm0 & xmm0) | (~xmm0 & xmm0 & ~xmm0) | (~xmm0 & xmm0 & xmm0) ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc1] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %not = xor <2 x i64> %a0, diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll b/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll index fb3d57e595307..e8b0facf534b0 100644 --- a/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll @@ -442,7 +442,7 @@ define <16 x i32> @stack_fold_inserti32x8(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -461,7 +461,7 @@ define <8 x i64> @stack_fold_inserti64x4(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -2210,7 +2210,7 @@ define <16 x i32> @stack_fold_permd(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -2286,7 +2286,7 @@ define <8 x i64> @stack_fold_permq(<8 x i64> %a0) { ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload ; CHECK-NEXT: # zmm0 = mem[3,2,2,3,7,6,6,7] -; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -2307,7 +2307,7 @@ define <8 x i64> @stack_fold_permq_mask(ptr %passthru, <8 x i64> %a0, i8 %mask) ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 ; CHECK-NEXT: vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: # zmm0 {%k1} = mem[3,2,2,3,7,6,6,7] -; CHECK-NEXT: vpternlogd $255, 
%zmm1, %zmm1, %zmm1 +; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -2347,7 +2347,7 @@ define <8 x i64> @stack_fold_permqvar(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: vpermq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -2368,7 +2368,7 @@ define <8 x i64> @stack_fold_permqvar_mask(ptr %passthru, <8 x i64> %a0, <8 x i6 ; CHECK-NEXT: kmovd %esi, %k1 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 ; CHECK-NEXT: vpermq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload -; CHECK-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; CHECK-NEXT: vpternlogd {{.*#+}} zmm0 = -1 ; CHECK-NEXT: vpsubq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -6091,7 +6091,7 @@ define <16 x i32> @stack_fold_pshufd_zmm(<16 x i32> %a0) { ; CHECK-NEXT: 
#NO_APP ; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload ; CHECK-NEXT: # zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] -; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -7047,6 +7047,7 @@ define <16 x i32> @stack_fold_ternlogd(<16 x i32> %x0, <16 x i32> %x1, <16 x i32 ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: vpternlogd $33, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: # zmm0 = (~zmm0 & ~zmm1 & ~mem) | (zmm0 & ~zmm1 & mem) ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33) @@ -7062,6 +7063,7 @@ define <8 x i64> @stack_fold_ternlogq(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: vpternlogq $33, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: # zmm0 = (~zmm0 & ~zmm1 & ~mem) | (zmm0 & ~zmm1 & mem) ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll index b18f08b62f0d4..fd9ba68d5707a 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -3926,7 +3926,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufb %xmm7, %xmm13, %xmm7 ; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $184, %zmm15, %zmm18, %zmm7 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & zmm18 & zmm15) | (zmm7 & ~zmm18 & ~zmm15) | (zmm7 & ~zmm18 & zmm15) | (zmm7 & zmm18 & zmm15) ; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm7, %zmm16 ; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5],ymm9[6],ymm3[7,8],ymm9[9],ymm3[10,11],ymm9[12],ymm3[13],ymm9[14],ymm3[15] ; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm14 @@ -3959,7 +3959,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} xmm13 = xmm12[0,1],xmm11[2],xmm12[3] ; AVX512-NEXT: vpshufb %xmm14, %xmm13, %xmm13 ; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512-NEXT: vpternlogq $184, %zmm0, %zmm18, %zmm13 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = (~zmm13 & zmm18 & zmm0) | (zmm13 & ~zmm18 & ~zmm0) | (zmm13 & ~zmm18 & zmm0) | (zmm13 & zmm18 & 
zmm0) ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm19 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm3[1],ymm9[2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7,8],ymm3[9],ymm9[10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm7 @@ -4007,7 +4007,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa %ymm2, %ymm9 ; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2,3],xmm14[4,5],xmm15[6,7] -; AVX512-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm13 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = (~zmm13 & zmm0 & ~mem) | (zmm13 & ~zmm0 & mem) | (zmm13 & zmm0 & ~mem) | (zmm13 & zmm0 & mem) ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-NEXT: vextracti64x4 $1, %zmm13, %ymm14 @@ -4032,7 +4032,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %ymm21, %ymm2 ; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm11 ; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512-NEXT: vpternlogq $184, %zmm13, %zmm18, %zmm11 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & zmm18 & zmm13) | (zmm11 & ~zmm18 & ~zmm13) | (zmm11 & ~zmm18 & zmm13) | (zmm11 & zmm18 & zmm13) ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm11, %zmm7 ; AVX512-NEXT: vmovdqa64 %ymm20, %ymm2 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4],ymm3[5],ymm2[6,7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12],ymm3[13],ymm2[14,15] @@ -4117,7 +4117,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm13, %ymm13 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm12, %zmm18, %zmm13 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = (~zmm13 & zmm18 & zmm12) | (zmm13 & ~zmm18 & ~zmm12) | (zmm13 & ~zmm18 & zmm12) | (zmm13 & zmm18 & zmm12) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm13, %zmm16 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 @@ -4133,7 +4133,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm15 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm15 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm12, %zmm18, %zmm15 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = (~zmm15 & zmm18 & zmm12) | (zmm15 & ~zmm18 & ~zmm12) | (zmm15 & ~zmm18 & zmm12) | (zmm15 & zmm18 & zmm12) ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm6 ; AVX512-FCP-NEXT: vpsrlq $48, %xmm2, %xmm12 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1] @@ -4177,7 +4177,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # ymm15 = mem[0,1,0,1] ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm15, %ymm15 ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm14 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm12, %zmm18, %zmm14 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = (~zmm14 & zmm18 & zmm12) | (zmm14 & ~zmm18 & ~zmm12) | (zmm14 & ~zmm18 & zmm12) | (zmm14 & zmm18 & zmm12) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm14, %zmm14 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = 
ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm12 @@ -4201,7 +4201,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm15, %ymm15 ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm13 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm12 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = (~zmm12 & zmm6 & mem) | (zmm12 & ~zmm6 & ~mem) | (zmm12 & zmm6 & ~mem) | (zmm12 & zmm6 & mem) ; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm6 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 @@ -4298,7 +4298,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufb %xmm7, %xmm13, %xmm7 ; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $184, %zmm15, %zmm18, %zmm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & zmm18 & zmm15) | (zmm7 & ~zmm18 & ~zmm15) | (zmm7 & ~zmm18 & zmm15) | (zmm7 & zmm18 & zmm15) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm7, %zmm16 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5],ymm9[6],ymm3[7,8],ymm9[9],ymm3[10,11],ymm9[12],ymm3[13],ymm9[14],ymm3[15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm14 @@ -4331,7 +4331,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = xmm12[0,1],xmm11[2],xmm12[3] ; AVX512DQ-NEXT: vpshufb %xmm14, %xmm13, %xmm13 ; 
AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-NEXT: vpternlogq $184, %zmm0, %zmm18, %zmm13 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = (~zmm13 & zmm18 & zmm0) | (zmm13 & ~zmm18 & ~zmm0) | (zmm13 & ~zmm18 & zmm0) | (zmm13 & zmm18 & zmm0) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm19 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm3[1],ymm9[2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7,8],ymm3[9],ymm9[10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm7 @@ -4379,7 +4379,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm9 ; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2,3],xmm14[4,5],xmm15[6,7] -; AVX512DQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm13 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = (~zmm13 & zmm0 & ~mem) | (zmm13 & ~zmm0 & mem) | (zmm13 & zmm0 & ~mem) | (zmm13 & zmm0 & mem) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13] ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm13, %ymm14 @@ -4404,7 +4404,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm2 ; AVX512DQ-NEXT: vpshufb %xmm2, %xmm11, %xmm11 ; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-NEXT: vpternlogq $184, %zmm13, %zmm18, %zmm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & zmm18 & zmm13) | (zmm11 & ~zmm18 & ~zmm13) | (zmm11 & ~zmm18 & zmm13) | (zmm11 & zmm18 & zmm13) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm11, %zmm7 ; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4],ymm3[5],ymm2[6,7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12],ymm3[13],ymm2[14,15] @@ -4489,7 +4489,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm13, %ymm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm12, %zmm18, %zmm13 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = (~zmm13 & zmm18 & zmm12) | (zmm13 & ~zmm18 & ~zmm12) | (zmm13 & ~zmm18 & zmm12) | (zmm13 & zmm18 & zmm12) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm13, %zmm16 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 @@ -4505,7 +4505,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm15 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm15 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm12, %zmm18, %zmm15 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = (~zmm15 & zmm18 & zmm12) | (zmm15 & ~zmm18 & ~zmm12) | (zmm15 & ~zmm18 & zmm12) | (zmm15 & zmm18 & zmm12) ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm6 ; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm2, %xmm12 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1] @@ -4549,7 +4549,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # ymm15 = mem[0,1,0,1] ; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm15, %ymm15 ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm14 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm12, %zmm18, %zmm14 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = (~zmm14 & zmm18 
& zmm12) | (zmm14 & ~zmm18 & ~zmm12) | (zmm14 & ~zmm18 & zmm12) | (zmm14 & zmm18 & zmm12) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm14, %zmm14 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm12 @@ -4573,7 +4573,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm15, %ymm15 ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm13 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm12 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = (~zmm12 & zmm6 & mem) | (zmm12 & ~zmm6 & ~mem) | (zmm12 & zmm6 & ~mem) | (zmm12 & zmm6 & mem) ; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm6 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 @@ -8060,7 +8060,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5],ymm15[6],ymm12[7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13],ymm15[14],ymm12[15] ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] -; AVX512-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm0 & ~mem) | (zmm1 & ~zmm0 & mem) | (zmm1 & zmm0 & ~mem) | (zmm1 & zmm0 & mem) ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm2 @@ -8118,7 +8118,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7] ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $184, %zmm19, %zmm27, %zmm10 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm27 & zmm19) | (zmm10 & ~zmm27 & ~zmm19) | (zmm10 & ~zmm27 & zmm19) | (zmm10 & zmm27 & zmm19) ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm12[2],ymm15[3],ymm12[4],ymm15[5,6],ymm12[7],ymm15[8,9],ymm12[10],ymm15[11],ymm12[12],ymm15[13,14],ymm12[15] @@ -8146,6 +8146,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 64-byte Folded Reload +; AVX512-NEXT: # zmm2 = (~zmm2 & zmm27 & mem) | (zmm2 & ~zmm27 & ~mem) | (zmm2 & ~zmm27 & mem) | (zmm2 & zmm27 & mem) ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %xmm22, %xmm4 @@ -8177,6 +8178,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5,6,7] ; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 64-byte Folded Reload +; AVX512-NEXT: # zmm2 = (~zmm2 & zmm27 & mem) | (zmm2 & ~zmm27 & ~mem) | (zmm2 & ~zmm27 & mem) | (zmm2 & zmm27 & mem) ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpblendw 
{{.*#+}} ymm2 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15] @@ -8204,7 +8206,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %xmm6, %xmm23 ; AVX512-NEXT: vpshufb %xmm0, %xmm10, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vpternlogq $184, %zmm28, %zmm27, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm27 & zmm28) | (zmm0 & ~zmm27 & ~zmm28) | (zmm0 & ~zmm27 & zmm28) | (zmm0 & zmm27 & zmm28) ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload @@ -8229,7 +8231,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %xmm8, %xmm28 ; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] ; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512-NEXT: vpternlogq $184, %zmm2, %zmm27, %zmm13 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = (~zmm13 & zmm27 & zmm2) | (zmm13 & ~zmm27 & ~zmm2) | (zmm13 & ~zmm27 & zmm2) | (zmm13 & zmm27 & zmm2) ; AVX512-NEXT: vmovdqa64 %ymm26, %ymm8 ; AVX512-NEXT: vmovdqa64 %ymm22, %ymm4 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm4[1],ymm8[2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10],ymm4[11],ymm8[12,13],ymm4[14],ymm8[15] @@ -8293,7 +8295,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] ; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512-NEXT: vpternlogq $184, %zmm0, %zmm27, %zmm10 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm27 & zmm0) | (zmm10 & ~zmm27 & ~zmm0) | (zmm10 & ~zmm27 & zmm0) | (zmm10 & zmm27 & zmm0) ; AVX512-NEXT: 
vinserti64x4 $1, %ymm2, %zmm10, %zmm23 ; AVX512-NEXT: vmovdqa64 %ymm18, %ymm13 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15] @@ -8326,7 +8328,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & zmm0 & mem) | (zmm8 & ~zmm0 & ~mem) | (zmm8 & zmm0 & ~mem) | (zmm8 & zmm0 & mem) ; AVX512-NEXT: vmovdqa %ymm9, %ymm2 ; AVX512-NEXT: vmovdqa64 %ymm21, %ymm10 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] @@ -8494,7 +8496,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm7 ; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm14 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm4, %zmm8, %zmm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & zmm8 & zmm4) | (zmm7 & ~zmm8 & ~zmm4) | (zmm7 & ~zmm8 & zmm4) | (zmm7 & zmm8 & zmm4) ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm16 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8536,7 +8538,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 ; AVX512-FCP-NEXT: vpermd %ymm23, %ymm18, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; 
AVX512-FCP-NEXT: vpternlogq $184, %zmm1, %zmm16, %zmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm16 & zmm1) | (zmm2 & ~zmm16 & ~zmm1) | (zmm2 & ~zmm16 & zmm1) | (zmm2 & zmm16 & zmm1) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4,5],ymm12[6],ymm6[7,8],ymm12[9],ymm6[10],ymm12[11],ymm6[12,13],ymm12[14],ymm6[15] @@ -8581,7 +8583,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermd %ymm23, %ymm25, %ymm4 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm10, %zmm16, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm16 & zmm10) | (zmm4 & ~zmm16 & ~zmm10) | (zmm4 & ~zmm16 & zmm10) | (zmm4 & zmm16 & zmm10) ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm10 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8621,7 +8623,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload ; AVX512-FCP-NEXT: vpermd %ymm17, %ymm25, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm0, %zmm10, %zmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm10 & zmm0) | (zmm2 & ~zmm10 & ~zmm0) | (zmm2 & ~zmm10 & zmm0) | (zmm2 & zmm10 & zmm0) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5],ymm12[6],ymm9[7,8],ymm12[9],ymm9[10,11],ymm12[12],ymm9[13],ymm12[14],ymm9[15] @@ -8666,7 
+8668,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm14, %ymm14 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm0, %zmm15, %zmm14 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = (~zmm14 & zmm15 & zmm0) | (zmm14 & ~zmm15 & ~zmm0) | (zmm14 & ~zmm15 & zmm0) | (zmm14 & zmm15 & zmm0) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm4[1],ymm8[2,3],ymm4[4],ymm8[5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10,11],ymm4[12],ymm8[13],ymm4[14],ymm8[15] @@ -8700,7 +8702,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX512-FCP-NEXT: vpermd %ymm17, %ymm5, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm1, %zmm15, %zmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm15 & zmm1) | (zmm2 & ~zmm15 & ~zmm1) | (zmm2 & ~zmm15 & zmm1) | (zmm2 & zmm15 & zmm1) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm28 ; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm15 ; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm7 @@ -8769,7 +8771,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm5 = [18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0,18446744073709486080,18446744073709551615] -; AVX512-FCP-NEXT: vpternlogq $184, 
%zmm3, %zmm5, %zmm25 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm25 = (~zmm25 & zmm5 & zmm3) | (zmm25 & ~zmm5 & ~zmm3) | (zmm25 & ~zmm5 & zmm3) | (zmm25 & zmm5 & zmm3) ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX512-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] @@ -8794,7 +8796,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm24 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = (~zmm24 & zmm5 & zmm4) | (zmm24 & ~zmm5 & ~zmm4) | (zmm24 & ~zmm5 & zmm4) | (zmm24 & zmm5 & zmm4) ; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm4 ; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3] @@ -8992,7 +8994,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5],ymm15[6],ymm12[7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13],ymm15[14],ymm12[15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] -; AVX512DQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm0 & ~mem) | (zmm1 & ~zmm0 & mem) | (zmm1 & zmm0 & ~mem) | (zmm1 & zmm0 & mem) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 @@ -9050,7 +9052,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, pt ; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7] ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $184, %zmm19, %zmm27, %zmm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm27 & zmm19) | (zmm10 & ~zmm27 & ~zmm19) | (zmm10 & ~zmm27 & zmm19) | (zmm10 & zmm27 & zmm19) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm12[2],ymm15[3],ymm12[4],ymm15[5,6],ymm12[7],ymm15[8,9],ymm12[10],ymm15[11],ymm12[12],ymm15[13,14],ymm12[15] @@ -9078,6 +9080,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm2 = (~zmm2 & zmm27 & mem) | (zmm2 & ~zmm27 & ~mem) | (zmm2 & ~zmm27 & mem) | (zmm2 & zmm27 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm4 @@ -9109,6 +9112,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5,6,7] ; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm2 = (~zmm2 & zmm27 & mem) | (zmm2 & ~zmm27 & ~mem) | (zmm2 & ~zmm27 & mem) | (zmm2 & zmm27 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15] @@ -9136,7 +9140,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm23 ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm10, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpternlogq $184, %zmm28, %zmm27, %zmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm27 & zmm28) | (zmm0 & ~zmm27 & ~zmm28) | (zmm0 & ~zmm27 & zmm28) | (zmm0 & zmm27 & zmm28) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload @@ -9161,7 +9165,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm28 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] ; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-NEXT: vpternlogq $184, %zmm2, %zmm27, %zmm13 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = (~zmm13 & zmm27 & zmm2) | (zmm13 & ~zmm27 & ~zmm2) | (zmm13 & ~zmm27 & zmm2) | (zmm13 & zmm27 & zmm2) ; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm8 ; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm4 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm4[1],ymm8[2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10],ymm4[11],ymm8[12,13],ymm4[14],ymm8[15] @@ -9225,7 +9229,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] ; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-NEXT: vpternlogq $184, %zmm0, %zmm27, %zmm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm27 & zmm0) | (zmm10 & ~zmm27 & ~zmm0) 
| (zmm10 & ~zmm27 & zmm0) | (zmm10 & zmm27 & zmm0) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm10, %zmm23 ; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm13 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15] @@ -9258,7 +9262,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & zmm0 & mem) | (zmm8 & ~zmm0 & ~mem) | (zmm8 & zmm0 & ~mem) | (zmm8 & zmm0 & mem) ; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm2 ; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm10 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] @@ -9426,7 +9430,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm4, %zmm8, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & zmm8 & zmm4) | (zmm7 & ~zmm8 & ~zmm4) | (zmm7 & ~zmm8 & zmm4) | (zmm7 & zmm8 & zmm4) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm16 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9468,7 +9472,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 ; 
AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm18, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm16, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm16 & zmm1) | (zmm2 & ~zmm16 & ~zmm1) | (zmm2 & ~zmm16 & zmm1) | (zmm2 & zmm16 & zmm1) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4,5],ymm12[6],ymm6[7,8],ymm12[9],ymm6[10],ymm12[11],ymm6[12,13],ymm12[14],ymm6[15] @@ -9513,7 +9517,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm25, %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm10, %zmm16, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm16 & zmm10) | (zmm4 & ~zmm16 & ~zmm10) | (zmm4 & ~zmm16 & zmm10) | (zmm4 & zmm16 & zmm10) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm10 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9553,7 +9557,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm25, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm0, %zmm10, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm10 & zmm0) | (zmm2 & ~zmm10 & ~zmm0) | (zmm2 & ~zmm10 & zmm0) | (zmm2 & zmm10 & zmm0) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpblendw 
{{.*#+}} ymm0 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5],ymm12[6],ymm9[7,8],ymm12[9],ymm9[10,11],ymm12[12],ymm9[13],ymm12[14],ymm9[15] @@ -9598,7 +9602,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm14, %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm0, %zmm15, %zmm14 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = (~zmm14 & zmm15 & zmm0) | (zmm14 & ~zmm15 & ~zmm0) | (zmm14 & ~zmm15 & zmm0) | (zmm14 & zmm15 & zmm0) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm4[1],ymm8[2,3],ymm4[4],ymm8[5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10,11],ymm4[12],ymm8[13],ymm4[14],ymm8[15] @@ -9632,7 +9636,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm5, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm15, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm15 & zmm1) | (zmm2 & ~zmm15 & ~zmm1) | (zmm2 & ~zmm15 & zmm1) | (zmm2 & zmm15 & zmm1) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm28 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm7 @@ -9701,7 +9705,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] ; 
AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm5 = [18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0,18446744073709486080,18446744073709551615] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm3, %zmm5, %zmm25 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm25 = (~zmm25 & zmm5 & zmm3) | (zmm25 & ~zmm5 & ~zmm3) | (zmm25 & ~zmm5 & zmm3) | (zmm25 & zmm5 & zmm3) ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] @@ -9726,7 +9730,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm24 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = (~zmm24 & zmm5 & zmm4) | (zmm24 & ~zmm5 & ~zmm4) | (zmm24 & ~zmm5 & zmm4) | (zmm24 & zmm5 & zmm4) ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll index 605deed6536bf..a37b8e33ceffe 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll @@ -2497,7 +2497,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = 
[65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $236, %ymm11, %ymm7, %ymm10 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ymm7 & ~ymm11) | (~ymm10 & ymm7 & ymm11) | (ymm10 & ~ymm7 & ymm11) | (ymm10 & ymm7 & ~ymm11) | (ymm10 & ymm7 & ymm11) ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] @@ -2512,7 +2512,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpternlogq $236, %ymm11, %ymm4, %ymm3 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm4 & ~ymm11) | (~ymm3 & ymm4 & ymm11) | (ymm3 & ~ymm4 & ymm11) | (ymm3 & ymm4 & ~ymm11) | (ymm3 & ymm4 & ymm11) ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] @@ -2608,7 +2608,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $236, %ymm11, %ymm7, %ymm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ymm7 & ~ymm11) | (~ymm10 & ymm7 & ymm11) | (ymm10 & ~ymm7 & ymm11) | (ymm10 & ymm7 & ~ymm11) | (ymm10 
& ymm7 & ymm11) ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm7 @@ -2623,7 +2623,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq $248, %ymm11, %ymm2, %ymm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm2 & ymm11) | (ymm3 & ~ymm2 & ~ymm11) | (ymm3 & ~ymm2 & ymm11) | (ymm3 & ymm2 & ~ymm11) | (ymm3 & ymm2 & ymm11) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm6 ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 @@ -2721,7 +2721,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $236, %ymm11, %ymm7, %ymm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ymm7 & ~ymm11) | (~ymm10 & ymm7 & ymm11) | (ymm10 & ~ymm7 & ymm11) | (ymm10 & ymm7 & ~ymm11) | (ymm10 & ymm7 & ymm11) ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] @@ -2736,7 +2736,7 @@ define void @load_i16_stride6_vf16(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpternlogq $236, %ymm11, %ymm4, %ymm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm4 & ~ymm11) | (~ymm3 & ymm4 & ymm11) | (ymm3 & ~ymm4 & ymm11) | (ymm3 & ymm4 & ~ymm11) | (ymm3 & ymm4 & ymm11) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] @@ -2832,7 +2832,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm11, %ymm7, %ymm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ymm7 & ~ymm11) | (~ymm10 & ymm7 & ymm11) | (ymm10 & ~ymm7 & ymm11) | (ymm10 & ymm7 & ~ymm11) | (ymm10 & ymm7 & ymm11) ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm7 @@ -2847,7 +2847,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm11, %ymm2, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm2 & ymm11) | (ymm3 & ~ymm2 & ~ymm11) | (ymm3 & ~ymm2 & ymm11) | (ymm3 & ymm2 & ~ymm11) | (ymm3 & ymm2 & ymm11) ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm6 ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 @@ -5147,9 +5147,9 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $226, %zmm17, %zmm0, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & ~zmm0 & zmm17) | (zmm2 & ~zmm0 & zmm17) | (zmm2 & zmm0 & ~zmm17) | (zmm2 & zmm0 & zmm17) ; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm16 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = (~zmm16 & zmm17 & zmm2) | (zmm16 & ~zmm17 & ~zmm2) | (zmm16 & ~zmm17 & zmm2) | (zmm16 & zmm17 & zmm2) ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[3,1,2,1,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[0,1,3,3,4,5,6,7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,7,7,7] @@ -5178,9 +5178,9 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] ; 
AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX512-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm5 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & ~zmm0 & zmm2) | (zmm5 & ~zmm0 & zmm2) | (zmm5 & zmm0 & ~zmm2) | (zmm5 & zmm0 & zmm2) ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512-NEXT: vpternlogq $184, %zmm5, %zmm17, %zmm20 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = (~zmm20 & zmm17 & zmm5) | (zmm20 & ~zmm17 & ~zmm5) | (zmm20 & ~zmm17 & zmm5) | (zmm20 & zmm17 & zmm5) ; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX512-NEXT: vmovdqa64 %ymm31, %ymm0 ; AVX512-NEXT: vmovdqa64 %ymm21, %ymm1 @@ -5211,7 +5211,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6,7] ; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $236, %ymm11, %ymm10, %ymm8 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = (~ymm8 & ymm10 & ~ymm11) | (~ymm8 & ymm10 & ymm11) | (ymm8 & ~ymm10 & ymm11) | (ymm8 & ymm10 & ~ymm11) | (ymm8 & ymm10 & ymm11) ; AVX512-NEXT: movw $31, %ax ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1} @@ -5248,7 +5248,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpternlogq $236, %ymm11, %ymm4, %ymm3 +; AVX512-NEXT: 
vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm4 & ~ymm11) | (~ymm3 & ymm4 & ymm11) | (ymm3 & ~ymm4 & ymm11) | (ymm3 & ymm4 & ~ymm11) | (ymm3 & ymm4 & ymm11) ; AVX512-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} ; AVX512-NEXT: vpshufb %xmm1, %xmm8, %xmm1 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,1,3,4,5,6,7] @@ -5261,16 +5261,18 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload +; AVX512-NEXT: # zmm4 = (~zmm4 & ~zmm3 & mem) | (zmm4 & ~zmm3 & mem) | (zmm4 & zmm3 & ~mem) | (zmm4 & zmm3 & mem) ; AVX512-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa32 %zmm30, %zmm4 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm15 # 64-byte Folded Reload +; AVX512-NEXT: # zmm15 = (~zmm15 & ~zmm3 & mem) | (zmm15 & ~zmm3 & mem) | (zmm15 & zmm3 & ~mem) | (zmm15 & zmm3 & mem) ; AVX512-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm15, (%rdx) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm10 -; AVX512-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm17 & zmm2) | (zmm10 & ~zmm17 & ~zmm2) | (zmm10 & ~zmm17 & zmm2) | (zmm10 & zmm17 & zmm2) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm17 & zmm0) | (zmm1 & ~zmm17 & ~zmm0) | (zmm1 & ~zmm17 & zmm0) | (zmm1 & zmm17 & zmm0) ; AVX512-NEXT: vmovdqa64 %zmm16, (%rcx) ; AVX512-NEXT: vmovdqa64 %zmm20, (%r8) ; AVX512-NEXT: vmovdqa64 %zmm10, (%r9) @@ -5426,9 +5428,9 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm17, %zmm0, %zmm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & ~zmm0 & zmm17) | (zmm11 & ~zmm0 & zmm17) | (zmm11 & zmm0 & ~zmm17) | (zmm11 & zmm0 & zmm17) ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm11, %zmm17, %zmm16 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = (~zmm16 & zmm17 & zmm11) | (zmm16 & ~zmm17 & ~zmm11) | (zmm16 & ~zmm17 & zmm11) | (zmm16 & zmm17 & zmm11) ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,1,4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1,2],xmm10[3],xmm9[4,5,6,7] @@ -5454,9 +5456,9 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm7, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm0 & zmm7) | (zmm4 & ~zmm0 & zmm7) | (zmm4 & zmm0 & ~zmm7) | (zmm4 & zmm0 & zmm7) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm4, %zmm17, %zmm20 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = (~zmm20 & zmm17 & zmm4) | (zmm20 & ~zmm17 & ~zmm4) | (zmm20 & ~zmm17 & zmm4) | (zmm20 & zmm17 & zmm4) ; AVX512-FCP-NEXT: vmovdqa 
{{.*#+}} xmm5 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm0 ; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm1 @@ -5486,7 +5488,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm10[1],xmm5[2,3],xmm10[4],xmm5[5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $236, %ymm11, %ymm10, %ymm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ymm10 & ~ymm11) | (~ymm5 & ymm10 & ymm11) | (ymm5 & ~ymm10 & ymm11) | (ymm5 & ymm10 & ~ymm11) | (ymm5 & ymm10 & ymm11) ; AVX512-FCP-NEXT: movw $31, %ax ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm5, %zmm2 {%k1} @@ -5509,7 +5511,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3],xmm9[4],xmm7[5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq $248, %ymm11, %ymm7, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ymm7 & ymm11) | (ymm0 & ~ymm7 & ~ymm11) | (ymm0 & ~ymm7 & ymm11) | (ymm0 & ymm7 & ~ymm11) | (ymm0 & ymm7 & ymm11) ; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm4 ; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7] @@ -5530,17 +5532,19 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm4 = (~zmm4 & ~zmm3 & mem) | (zmm4 & ~zmm3 & mem) | (zmm4 & zmm3 & ~mem) | (zmm4 & zmm3 & mem) ; AVX512-FCP-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm19, %zmm4 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm4 = (~zmm4 & ~zmm3 & mem) | (zmm4 & ~zmm3 & mem) | (zmm4 & zmm3 & ~mem) | (zmm4 & zmm3 & mem) ; AVX512-FCP-NEXT: vmovdqa32 %zmm28, %zmm4 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rdx) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm8 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm1, %zmm17, %zmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & zmm17 & zmm2) | (zmm8 & ~zmm17 & ~zmm2) | (zmm8 & ~zmm17 & zmm2) | (zmm8 & zmm17 & zmm2) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm17 & zmm1) | (zmm0 & ~zmm17 & ~zmm1) | (zmm0 & ~zmm17 & zmm1) | (zmm0 & zmm17 & zmm1) ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%r9) @@ -5612,7 +5616,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $226, %zmm16, %zmm17, %zmm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & ~zmm17 & zmm16) | (zmm10 & ~zmm17 & zmm16) | (zmm10 & zmm17 & ~zmm16) | (zmm10 & zmm17 & zmm16) ; AVX512DQ-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1} @@ -5635,7 +5639,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufb %xmm9, %xmm5, %xmm0 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX512DQ-NEXT: vpternlogq $226, %zmm3, %zmm17, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & ~zmm17 & zmm3) | (zmm2 & ~zmm17 & zmm3) | (zmm2 & zmm17 & ~zmm3) | (zmm2 & zmm17 & zmm3) ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] @@ -5702,9 +5706,9 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm18 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $226, %zmm16, %zmm0, %zmm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = (~zmm9 & ~zmm0 & zmm16) | (zmm9 & ~zmm0 & zmm16) | (zmm9 & zmm0 & ~zmm16) | (zmm9 & zmm0 & zmm16) ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm17 = 
[18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512DQ-NEXT: vpternlogq $184, %zmm9, %zmm17, %zmm18 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = (~zmm18 & zmm17 & zmm9) | (zmm18 & ~zmm17 & ~zmm9) | (zmm18 & ~zmm17 & zmm9) | (zmm18 & zmm17 & zmm9) ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[3,1,2,1,4,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,7,7,7] @@ -5731,9 +5735,9 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq $226, %zmm7, %zmm0, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm0 & zmm7) | (zmm4 & ~zmm0 & zmm7) | (zmm4 & zmm0 & ~zmm7) | (zmm4 & zmm0 & zmm7) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 -; AVX512DQ-NEXT: vpternlogq $184, %zmm4, %zmm17, %zmm16 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = (~zmm16 & zmm17 & zmm4) | (zmm16 & ~zmm17 & ~zmm4) | (zmm16 & ~zmm17 & zmm4) | (zmm16 & zmm17 & zmm4) ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm0 ; AVX512DQ-NEXT: vmovdqa64 %ymm30, %ymm1 @@ -5763,7 +5767,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq 
$236, %ymm14, %ymm8, %ymm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ymm8 & ~ymm14) | (~ymm6 & ymm8 & ymm14) | (ymm6 & ~ymm8 & ymm14) | (ymm6 & ymm8 & ~ymm14) | (ymm6 & ymm8 & ymm14) ; AVX512DQ-NEXT: movw $31, %ax ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vinserti32x8 $0, %ymm6, %zmm0, %zmm4 {%k1} @@ -5800,7 +5804,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpternlogq $236, %ymm14, %ymm3, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm3 & ~ymm14) | (~ymm2 & ymm3 & ymm14) | (ymm2 & ~ymm3 & ymm14) | (ymm2 & ymm3 & ~ymm14) | (ymm2 & ymm3 & ymm14) ; AVX512DQ-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm0 {%k1} ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,1,3,4,5,6,7] @@ -5815,8 +5819,8 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm2, (%rdx) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vpternlogq $184, %zmm4, %zmm17, %zmm8 -; AVX512DQ-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & zmm17 & zmm4) | (zmm8 & ~zmm17 & ~zmm4) | (zmm8 & ~zmm17 & zmm4) | (zmm8 & zmm17 & zmm4) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm17 & zmm0) | (zmm1 & ~zmm17 & ~zmm0) | (zmm1 & ~zmm17 & zmm0) | (zmm1 & zmm17 & zmm0) ; AVX512DQ-NEXT: vmovdqa64 %zmm18, (%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm16, (%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%r9) @@ -5888,7 +5892,7 @@ define void @load_i16_stride6_vf32(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm16, %zmm17, %zmm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & ~zmm17 & zmm16) | (zmm10 & ~zmm17 & zmm16) | (zmm10 & zmm17 & ~zmm16) | (zmm10 & zmm17 & zmm16) ; AVX512DQ-FCP-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1} @@ -5911,7 +5915,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm0 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm5, %zmm17, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & ~zmm17 & zmm5) | (zmm2 & ~zmm17 & zmm5) | (zmm2 & zmm17 & ~zmm5) | (zmm2 & zmm17 & zmm5) ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] @@ -5975,9 +5979,9 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm16, %zmm0, %zmm9 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = (~zmm9 & ~zmm0 & zmm16) | (zmm9 & ~zmm0 & zmm16) | (zmm9 & zmm0 & ~zmm16) | (zmm9 & zmm0 & zmm16) ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm9, %zmm17, %zmm18 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = (~zmm18 & zmm17 & zmm9) | (zmm18 & ~zmm17 & ~zmm9) | (zmm18 & ~zmm17 & zmm9) | (zmm18 & zmm17 & zmm9) ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[3,1,2,1,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2],xmm9[3],xmm8[4,5,6,7] @@ -6001,9 +6005,9 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm7, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & ~zmm0 & zmm7) | (zmm3 & ~zmm0 & zmm7) | (zmm3 & zmm0 & ~zmm7) | (zmm3 & zmm0 & zmm7) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm16 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = (~zmm16 & zmm17 & zmm3) | (zmm16 & ~zmm17 & ~zmm3) | (zmm16 & ~zmm17 & zmm3) | (zmm16 & zmm17 & zmm3) ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm1 @@ -6032,7 
+6036,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3],xmm11[4],xmm5[5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm13, %ymm11, %ymm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ymm11 & ~ymm13) | (~ymm5 & ymm11 & ymm13) | (ymm5 & ~ymm11 & ymm13) | (ymm5 & ymm11 & ~ymm13) | (ymm5 & ymm11 & ymm13) ; AVX512DQ-FCP-NEXT: movw $31, %ax ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm5, %zmm0, %zmm0 {%k1} @@ -6056,7 +6060,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm8 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3],xmm9[4],xmm8[5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm13, %ymm8, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm8 & ymm13) | (ymm2 & ~ymm8 & ~ymm13) | (ymm2 & ~ymm8 & ymm13) | (ymm2 & ymm8 & ~ymm13) | (ymm2 & ymm8 & ymm13) ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6,7] @@ -6079,8 +6083,8 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%rdx) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), 
%rax -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm7 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm17, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & zmm17 & zmm0) | (zmm7 & ~zmm17 & ~zmm0) | (zmm7 & ~zmm17 & zmm0) | (zmm7 & zmm17 & zmm0) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm17 & zmm1) | (zmm2 & ~zmm17 & ~zmm1) | (zmm2 & ~zmm17 & zmm1) | (zmm2 & zmm17 & zmm1) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%r9) @@ -10680,9 +10684,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $226, %zmm2, %zmm29, %zmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm29 & zmm2) | (zmm1 & ~zmm29 & zmm2) | (zmm1 & zmm29 & ~zmm2) | (zmm1 & zmm29 & zmm2) ; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & zmm2 & zmm1) | (zmm3 & ~zmm2 & ~zmm1) | (zmm3 & ~zmm2 & zmm1) | (zmm3 & zmm2 & zmm1) ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm22 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -10743,8 +10747,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; 
AVX512-NEXT: vpternlogq $226, %zmm23, %zmm29, %zmm2 -; AVX512-NEXT: vpternlogq $184, %zmm2, %zmm22, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & ~zmm29 & zmm23) | (zmm2 & ~zmm29 & zmm23) | (zmm2 & zmm29 & ~zmm23) | (zmm2 & zmm29 & zmm23) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm22 & zmm2) | (zmm0 & ~zmm22 & ~zmm2) | (zmm0 & ~zmm22 & zmm2) | (zmm0 & zmm22 & zmm2) ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %xmm21, %xmm0 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] @@ -10790,8 +10794,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm13 ; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm25 -; AVX512-NEXT: vpternlogq $226, %zmm28, %zmm29, %zmm0 -; AVX512-NEXT: vpternlogq $184, %zmm0, %zmm22, %zmm25 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & ~zmm29 & zmm28) | (zmm0 & ~zmm29 & zmm28) | (zmm0 & zmm29 & ~zmm28) | (zmm0 & zmm29 & zmm28) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm25 = (~zmm25 & zmm22 & zmm0) | (zmm25 & ~zmm22 & ~zmm0) | (zmm25 & ~zmm22 & zmm0) | (zmm25 & zmm22 & zmm0) ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[3,1,2,1,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,3,4,5,6,7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] @@ -10819,8 +10823,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm28 -; AVX512-NEXT: vpternlogq $226, %zmm0, %zmm29, %zmm2 -; AVX512-NEXT: vpternlogq $184, %zmm2, %zmm22, %zmm28 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & ~zmm29 & zmm0) | (zmm2 & ~zmm29 & zmm0) | (zmm2 & zmm29 & ~zmm0) | (zmm2 & zmm29 & zmm0) +; AVX512-NEXT: 
vpternlogq {{.*#+}} zmm28 = (~zmm28 & zmm22 & zmm2) | (zmm28 & ~zmm22 & ~zmm2) | (zmm28 & ~zmm22 & zmm2) | (zmm28 & zmm22 & zmm2) ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX512-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] @@ -10859,7 +10863,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512-NEXT: vpshufb %ymm0, %ymm5, %ymm4 ; AVX512-NEXT: vmovdqa64 %ymm5, %ymm26 -; AVX512-NEXT: vpternlogq $236, %ymm29, %ymm4, %ymm3 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm4 & ~ymm29) | (~ymm3 & ymm4 & ymm29) | (ymm3 & ~ymm4 & ymm29) | (ymm3 & ymm4 & ~ymm29) | (ymm3 & ymm4 & ymm29) ; AVX512-NEXT: movw $31, %ax ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa32 %zmm3, %zmm30 {%k1} @@ -10896,7 +10900,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] ; AVX512-NEXT: vpshufb %ymm0, %ymm5, %ymm0 ; AVX512-NEXT: vmovdqa64 %ymm5, %ymm17 -; AVX512-NEXT: vpternlogq $236, %ymm29, %ymm0, %ymm4 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ymm0 & ~ymm29) | (~ymm4 & ymm0 & ymm29) | (ymm4 & ~ymm0 & ymm29) | (ymm4 & ymm0 & ~ymm29) | (ymm4 & ymm0 & ymm29) ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload ; AVX512-NEXT: # ymm5 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] @@ -10954,7 +10958,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = 
[128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512-NEXT: vmovdqa64 %ymm26, %ymm0 ; AVX512-NEXT: vpshufb %ymm10, %ymm0, %ymm13 -; AVX512-NEXT: vpternlogq $236, %ymm29, %ymm13, %ymm3 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm13 & ~ymm29) | (~ymm3 & ymm13 & ymm29) | (ymm3 & ~ymm13 & ymm29) | (ymm3 & ymm13 & ~ymm29) | (ymm3 & ymm13 & ymm29) ; AVX512-NEXT: vmovdqa32 %zmm3, %zmm2 {%k1} ; AVX512-NEXT: vmovdqa64 %xmm18, %xmm0 ; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm3 @@ -10973,7 +10977,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,1,2,3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] ; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2,3],xmm14[4],xmm10[5,6,7] -; AVX512-NEXT: vpternlogq $236, %ymm29, %ymm3, %ymm10 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ymm3 & ~ymm29) | (~ymm10 & ymm3 & ymm29) | (ymm10 & ~ymm3 & ymm29) | (ymm10 & ymm3 & ~ymm29) | (ymm10 & ymm3 & ymm29) ; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm1 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] @@ -10999,6 +11003,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload +; AVX512-NEXT: # zmm7 = (~zmm7 & ~zmm6 & mem) | (zmm7 & ~zmm6 & mem) | (zmm7 & zmm6 & ~mem) | (zmm7 & zmm6 & mem) ; AVX512-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload @@ -11006,24 
+11011,27 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload +; AVX512-NEXT: # zmm7 = (~zmm7 & ~zmm6 & mem) | (zmm7 & ~zmm6 & mem) | (zmm7 & zmm6 & ~mem) | (zmm7 & zmm6 & mem) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rsi) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload +; AVX512-NEXT: # zmm7 = (~zmm7 & ~zmm6 & mem) | (zmm7 & ~zmm6 & mem) | (zmm7 & zmm6 & ~mem) | (zmm7 & zmm6 & mem) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rdx) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload +; AVX512-NEXT: # zmm7 = (~zmm7 & ~zmm6 & mem) | (zmm7 & ~zmm6 & mem) | (zmm7 & zmm6 & ~mem) | (zmm7 & zmm6 & mem) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm7, (%rdx) ; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm6 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512-NEXT: vpternlogq $184, %zmm30, %zmm6, %zmm3 -; AVX512-NEXT: vpternlogq $184, %zmm31, %zmm6, %zmm5 -; AVX512-NEXT: vpternlogq $184, %zmm2, %zmm6, %zmm0 -; AVX512-NEXT: vpternlogq $184, %zmm1, %zmm6, %zmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & zmm6 & zmm30) | (zmm3 & ~zmm6 & ~zmm30) | (zmm3 & ~zmm6 & zmm30) | (zmm3 & zmm6 & zmm30) +; AVX512-NEXT: vpternlogq 
{{.*#+}} zmm5 = (~zmm5 & zmm6 & zmm31) | (zmm5 & ~zmm6 & ~zmm31) | (zmm5 & ~zmm6 & zmm31) | (zmm5 & zmm6 & zmm31) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm6 & zmm2) | (zmm0 & ~zmm6 & ~zmm2) | (zmm0 & ~zmm6 & zmm2) | (zmm0 & zmm6 & zmm2) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm6 & zmm1) | (zmm4 & ~zmm6 & ~zmm1) | (zmm4 & ~zmm6 & zmm1) | (zmm4 & zmm6 & zmm1) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm1, 64(%rcx) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -11306,9 +11314,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm3, %zmm29, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm29 & zmm3) | (zmm4 & ~zmm29 & zmm3) | (zmm4 & zmm29 & ~zmm3) | (zmm4 & zmm29 & zmm3) ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm26 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm4, %zmm26, %zmm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm26 & zmm4) | (zmm5 & ~zmm26 & ~zmm4) | (zmm5 & ~zmm26 & zmm4) | (zmm5 & zmm26 & zmm4) ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload @@ -11364,8 +11372,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] ; 
AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm25 -; AVX512-FCP-NEXT: vpternlogq $226, %zmm11, %zmm29, %zmm1 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm25 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm29 & zmm11) | (zmm1 & ~zmm29 & zmm11) | (zmm1 & zmm29 & ~zmm11) | (zmm1 & zmm29 & zmm11) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm25 = (~zmm25 & zmm26 & zmm1) | (zmm25 & ~zmm26 & ~zmm1) | (zmm25 & ~zmm26 & zmm1) | (zmm25 & zmm26 & zmm1) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] ; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm0 ; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 @@ -11404,8 +11412,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,4,5] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm23 -; AVX512-FCP-NEXT: vpternlogq $226, %zmm24, %zmm29, %zmm0 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm0, %zmm26, %zmm23 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & ~zmm29 & zmm24) | (zmm0 & ~zmm29 & zmm24) | (zmm0 & zmm29 & ~zmm24) | (zmm0 & zmm29 & zmm24) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = (~zmm23 & zmm26 & zmm0) | (zmm23 & ~zmm26 & ~zmm0) | (zmm23 & ~zmm26 & zmm0) | (zmm23 & zmm26 & zmm0) ; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm0 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload @@ -11439,8 +11447,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm28 -; AVX512-FCP-NEXT: vpternlogq $226, %zmm0, %zmm29, %zmm5 -; 
AVX512-FCP-NEXT: vpternlogq $184, %zmm5, %zmm26, %zmm28 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & ~zmm29 & zmm0) | (zmm5 & ~zmm29 & zmm0) | (zmm5 & zmm29 & ~zmm0) | (zmm5 & zmm29 & zmm0) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm28 = (~zmm28 & zmm26 & zmm5) | (zmm28 & ~zmm26 & ~zmm5) | (zmm28 & ~zmm26 & zmm5) | (zmm28 & zmm26 & zmm5) ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm11 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] @@ -11477,7 +11485,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm3 ; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm17 -; AVX512-FCP-NEXT: vpternlogq $236, %ymm29, %ymm3, %ymm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm3 & ~ymm29) | (~ymm2 & ymm3 & ymm29) | (ymm2 & ~ymm3 & ymm29) | (ymm2 & ymm3 & ~ymm29) | (ymm2 & ymm3 & ymm29) ; AVX512-FCP-NEXT: movw $31, %ax ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm30 {%k1} @@ -11512,7 +11520,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[2,2,2,2,4,5,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm0 -; AVX512-FCP-NEXT: vpternlogq $236, %ymm29, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm0 & ~ymm29) | (~ymm3 & ymm0 & ymm29) | (ymm3 & ~ymm0 & ymm29) | (ymm3 & ymm0 & ~ymm29) | (ymm3 & ymm0 & ymm29) ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded 
Reload ; AVX512-FCP-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] @@ -11555,7 +11563,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm0 ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm10 ; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm17 -; AVX512-FCP-NEXT: vpternlogq $236, %ymm29, %ymm10, %ymm14 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = (~ymm14 & ymm10 & ~ymm29) | (~ymm14 & ymm10 & ymm29) | (ymm14 & ~ymm10 & ymm29) | (ymm14 & ymm10 & ~ymm29) | (ymm14 & ymm10 & ymm29) ; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm0 ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm10 ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm11, %xmm11 @@ -11584,7 +11592,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm13, %xmm13 ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm6 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm13[1],xmm6[2,3],xmm13[4],xmm6[5,6,7] -; AVX512-FCP-NEXT: vpternlogq $236, %ymm29, %ymm14, %ymm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ymm14 & ~ymm29) | (~ymm6 & ymm14 & ymm29) | (ymm6 & ~ymm14 & ymm29) | (ymm6 & ymm14 & ~ymm29) | (ymm6 & ymm14 & ymm29) ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm7 ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1],xmm2[2,3],xmm7[4],xmm2[5,6,7] @@ -11607,6 +11615,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm5 = (~zmm5 & ~zmm3 & mem) | (zmm5 & ~zmm3 & mem) | (zmm5 & zmm3 & ~mem) | (zmm5 & zmm3 
& mem) ; AVX512-FCP-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload @@ -11614,23 +11623,26 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm5 = (~zmm5 & ~zmm3 & mem) | (zmm5 & ~zmm3 & mem) | (zmm5 & zmm3 & ~mem) | (zmm5 & zmm3 & mem) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm5 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rsi) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm5 = (~zmm5 & ~zmm3 & mem) | (zmm5 & ~zmm3 & mem) | (zmm5 & zmm3 & ~mem) | (zmm5 & zmm3 & mem) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm5 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rdx) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm5 = (~zmm5 & ~zmm3 & mem) | (zmm5 & ~zmm3 & mem) | (zmm5 & zmm3 & ~mem) | (zmm5 & zmm3 & mem) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm5 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512-FCP-NEXT: vpternlogq $184, %zmm30, %zmm26, %zmm4 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm31, %zmm26, %zmm7 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm0, %zmm26, %zmm11 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm2, %zmm26, %zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 
= (~zmm4 & zmm26 & zmm30) | (zmm4 & ~zmm26 & ~zmm30) | (zmm4 & ~zmm26 & zmm30) | (zmm4 & zmm26 & zmm30) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & zmm26 & zmm31) | (zmm7 & ~zmm26 & ~zmm31) | (zmm7 & ~zmm26 & zmm31) | (zmm7 & zmm26 & zmm31) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & zmm26 & zmm0) | (zmm11 & ~zmm26 & ~zmm0) | (zmm11 & ~zmm26 & zmm0) | (zmm11 & zmm26 & zmm0) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm26 & zmm2) | (zmm1 & ~zmm26 & ~zmm2) | (zmm1 & ~zmm26 & zmm2) | (zmm1 & zmm26 & zmm2) ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 64(%rcx) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, (%rcx) @@ -11719,7 +11731,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & ~zmm16 & zmm0) | (zmm3 & ~zmm16 & zmm0) | (zmm3 & zmm16 & ~zmm0) | (zmm3 & zmm16 & zmm0) ; AVX512DQ-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} @@ -11784,7 +11796,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq $226, %zmm17, %zmm16, %zmm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & ~zmm16 & zmm17) | (zmm7 & ~zmm16 & zmm17) | (zmm7 & 
zmm16 & ~zmm17) | (zmm7 & zmm16 & zmm17) ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm7 {%k1} ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15] @@ -11812,7 +11824,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = (~zmm6 & ~zmm16 & zmm0) | (zmm6 & ~zmm16 & zmm0) | (zmm6 & zmm16 & ~zmm0) | (zmm6 & zmm16 & zmm0) ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm6 {%k1} ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm0 @@ -11844,7 +11856,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm16 & zmm0) | (zmm4 & ~zmm16 & zmm0) | (zmm4 & zmm16 & ~zmm0) | (zmm4 & zmm16 & zmm0) ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1} ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11915,9 +11927,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} 
zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $226, %zmm2, %zmm21, %zmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm21 & zmm2) | (zmm1 & ~zmm21 & zmm2) | (zmm1 & zmm21 & ~zmm2) | (zmm1 & zmm21 & zmm2) ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512DQ-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & zmm2 & zmm1) | (zmm3 & ~zmm2 & ~zmm1) | (zmm3 & ~zmm2 & zmm1) | (zmm3 & zmm2 & zmm1) ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm18 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -11978,8 +11990,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm28 -; AVX512DQ-NEXT: vpternlogq $226, %zmm19, %zmm21, %zmm2 -; AVX512DQ-NEXT: vpternlogq $184, %zmm2, %zmm18, %zmm28 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & ~zmm21 & zmm19) | (zmm2 & ~zmm21 & zmm19) | (zmm2 & zmm21 & ~zmm19) | (zmm2 & zmm21 & zmm19) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm28 = (~zmm28 & zmm18 & zmm2) | (zmm28 & ~zmm18 & ~zmm2) | (zmm28 & ~zmm18 & zmm2) | (zmm28 & zmm18 & zmm2) ; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm0 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm2 @@ -12016,8 +12028,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,4,5] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] ; 
AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm27 -; AVX512DQ-NEXT: vpternlogq $226, %zmm20, %zmm21, %zmm0 -; AVX512DQ-NEXT: vpternlogq $184, %zmm0, %zmm18, %zmm27 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & ~zmm21 & zmm20) | (zmm0 & ~zmm21 & zmm20) | (zmm0 & zmm21 & ~zmm20) | (zmm0 & zmm21 & zmm20) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm27 = (~zmm27 & zmm18 & zmm0) | (zmm27 & ~zmm18 & ~zmm0) | (zmm27 & ~zmm18 & zmm0) | (zmm27 & zmm18 & zmm0) ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[3,1,2,1,4,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] @@ -12045,8 +12057,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512DQ-NEXT: vpternlogq $226, %zmm0, %zmm21, %zmm2 -; AVX512DQ-NEXT: vpternlogq $184, %zmm2, %zmm18, %zmm20 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & ~zmm21 & zmm0) | (zmm2 & ~zmm21 & zmm0) | (zmm2 & zmm21 & ~zmm0) | (zmm2 & zmm21 & zmm0) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = (~zmm20 & zmm18 & zmm2) | (zmm20 & ~zmm18 & ~zmm2) | (zmm20 & ~zmm18 & zmm2) | (zmm20 & zmm18 & zmm2) ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] @@ -12084,7 +12096,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm5, %ymm4 ; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm31 -; AVX512DQ-NEXT: vpternlogq $236, %ymm22, %ymm4, %ymm2 +; 
AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm4 & ~ymm22) | (~ymm2 & ymm4 & ymm22) | (ymm2 & ~ymm4 & ymm22) | (ymm2 & ymm4 & ~ymm22) | (ymm2 & ymm4 & ymm22) ; AVX512DQ-NEXT: movw $31, %ax ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm23 {%k1} @@ -12120,7 +12132,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[2,2,2,2,4,5,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6,7] ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm14, %ymm0 -; AVX512DQ-NEXT: vpternlogq $236, %ymm22, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm0 & ~ymm22) | (~ymm2 & ymm0 & ymm22) | (ymm2 & ~ymm0 & ymm22) | (ymm2 & ymm0 & ~ymm22) | (ymm2 & ymm0 & ymm22) ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm6 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] @@ -12178,7 +12190,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-NEXT: vmovdqa64 %ymm31, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm0, %ymm13 -; AVX512DQ-NEXT: vpternlogq $236, %ymm22, %ymm13, %ymm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ymm13 & ~ymm22) | (~ymm9 & ymm13 & ymm22) | (ymm9 & ~ymm13 & ymm22) | (ymm9 & ymm13 & ~ymm22) | (ymm9 & ymm13 & ymm22) ; AVX512DQ-NEXT: vinserti32x8 $0, %ymm9, %zmm0, %zmm26 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm0 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm9 @@ -12197,7 +12209,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = 
xmm12[0,1,2,3,5,5,5,5] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3],xmm12[4],xmm11[5,6,7] -; AVX512DQ-NEXT: vpternlogq $236, %ymm22, %ymm9, %ymm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ymm9 & ~ymm22) | (~ymm11 & ymm9 & ymm22) | (ymm11 & ~ymm9 & ymm22) | (ymm11 & ymm9 & ~ymm22) | (ymm11 & ymm9 & ymm22) ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,2,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] @@ -12226,10 +12238,10 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm3, (%rdx) ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512DQ-NEXT: vpternlogq $184, %zmm23, %zmm3, %zmm24 -; AVX512DQ-NEXT: vpternlogq $184, %zmm25, %zmm3, %zmm21 -; AVX512DQ-NEXT: vpternlogq $184, %zmm26, %zmm3, %zmm0 -; AVX512DQ-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm24 = (~zmm24 & zmm3 & zmm23) | (zmm24 & ~zmm3 & ~zmm23) | (zmm24 & ~zmm3 & zmm23) | (zmm24 & zmm3 & zmm23) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = (~zmm21 & zmm3 & zmm25) | (zmm21 & ~zmm3 & ~zmm25) | (zmm21 & ~zmm3 & zmm25) | (zmm21 & zmm3 & zmm25) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm3 & zmm26) | (zmm0 & ~zmm3 & ~zmm26) | (zmm0 & ~zmm3 & zmm26) | (zmm0 & zmm3 & zmm26) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm3 & zmm1) | (zmm2 & ~zmm3 & ~zmm1) | (zmm2 & ~zmm3 & zmm1) | (zmm2 & zmm3 & zmm1) ; AVX512DQ-NEXT: vmovdqa64 %zmm28, 64(%rcx) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm1, (%rcx) @@ -12320,7 +12332,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} 
xmm2 = xmm2[0,1,2,3,4,5,4,6] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & ~zmm16 & zmm0) | (zmm3 & ~zmm16 & zmm0) | (zmm3 & zmm16 & ~zmm0) | (zmm3 & zmm16 & zmm0) ; AVX512DQ-FCP-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} @@ -12385,7 +12397,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm17, %zmm16, %zmm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = (~zmm6 & ~zmm16 & zmm17) | (zmm6 & ~zmm16 & zmm17) | (zmm6 & zmm16 & ~zmm17) | (zmm6 & zmm16 & zmm17) ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15] @@ -12413,7 +12425,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = (~zmm6 & ~zmm16 & zmm0) | (zmm6 & ~zmm16 & zmm0) | (zmm6 & zmm16 & 
~zmm0) | (zmm6 & zmm16 & zmm0) ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm6 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm0 @@ -12445,7 +12457,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm16 & zmm0) | (zmm4 & ~zmm16 & zmm0) | (zmm4 & zmm16 & ~zmm0) | (zmm4 & zmm16 & zmm0) ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -12516,9 +12528,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm3, %zmm20, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm20 & zmm3) | (zmm4 & ~zmm20 & zmm3) | (zmm4 & zmm20 & ~zmm3) | (zmm4 & zmm20 & zmm3) ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm28 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm4, %zmm28, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm28 & zmm4) | (zmm5 & ~zmm28 & ~zmm4) | (zmm5 & ~zmm28 & zmm4) | 
(zmm5 & zmm28 & zmm4) ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload @@ -12575,8 +12587,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm27 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm9, %zmm20, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm28, %zmm27 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm20 & zmm9) | (zmm1 & ~zmm20 & zmm9) | (zmm1 & zmm20 & ~zmm9) | (zmm1 & zmm20 & zmm9) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = (~zmm27 & zmm28 & zmm1) | (zmm27 & ~zmm28 & ~zmm1) | (zmm27 & ~zmm28 & zmm1) | (zmm27 & zmm28 & zmm1) ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm26 @@ -12614,8 +12626,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,4,5] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm19 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm21, %zmm20, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm28, %zmm19 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm20 & zmm21) | (zmm1 & ~zmm20 & zmm21) | (zmm1 & zmm20 & ~zmm21) | (zmm1 & zmm20 & zmm21) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = (~zmm19 & zmm28 & zmm1) | (zmm19 & ~zmm28 & ~zmm1) | (zmm19 & ~zmm28 & zmm1) | (zmm19 & zmm28 & zmm1) ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 
%xmm26, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 @@ -12641,8 +12653,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm21 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm0, %zmm20, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm28, %zmm21 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm20 & zmm0) | (zmm1 & ~zmm20 & zmm0) | (zmm1 & zmm20 & ~zmm0) | (zmm1 & zmm20 & zmm0) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = (~zmm21 & zmm28 & zmm1) | (zmm21 & ~zmm28 & ~zmm1) | (zmm21 & ~zmm28 & zmm1) | (zmm21 & zmm28 & zmm1) ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm9 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] @@ -12678,7 +12690,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm14, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm20, %ymm3, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ymm3 & ~ymm20) | (~ymm0 & ymm3 & ymm20) | (ymm0 & ~ymm3 & ymm20) | (ymm0 & ymm3 & ~ymm20) | (ymm0 & ymm3 & ymm20) ; AVX512DQ-FCP-NEXT: movw $31, %ax ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm0, %zmm0, %zmm22 {%k1} @@ -12715,7 +12727,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 
= xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm31 -; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm20, %ymm2, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm2 & ~ymm20) | (~ymm3 & ymm2 & ymm20) | (ymm3 & ~ymm2 & ymm20) | (ymm3 & ymm2 & ~ymm20) | (ymm3 & ymm2 & ymm20) ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] @@ -12757,7 +12769,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm26 -; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm20, %ymm14, %ymm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ymm14 & ~ymm20) | (~ymm10 & ymm14 & ymm20) | (ymm10 & ~ymm14 & ymm20) | (ymm10 & ymm14 & ~ymm20) | (ymm10 & ymm14 & ymm20) ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm14 ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm9, %xmm9 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm9[0],xmm14[1],xmm9[2,3],xmm14[4],xmm9[5,6,7] @@ -12788,7 +12800,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm11 ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm5 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3],xmm11[4],xmm5[5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm20, %ymm15, %ymm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ymm15 & ~ymm20) | (~ymm5 & ymm15 & ymm20) | (ymm5 & ~ymm15 & ymm20) | (ymm5 & ymm15 & ~ymm20) | (ymm5 & ymm15 & ymm20) ; AVX512DQ-FCP-NEXT: vpshufb 
%xmm12, %xmm8, %xmm8 ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3],xmm8[4],xmm2[5,6,7] @@ -12813,10 +12825,10 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 64(%rdx) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%rdx) -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm22, %zmm28, %zmm23 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm24, %zmm28, %zmm13 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm0, %zmm28, %zmm10 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm2, %zmm28, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = (~zmm23 & zmm28 & zmm22) | (zmm23 & ~zmm28 & ~zmm22) | (zmm23 & ~zmm28 & zmm22) | (zmm23 & zmm28 & zmm22) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = (~zmm13 & zmm28 & zmm24) | (zmm13 & ~zmm28 & ~zmm24) | (zmm13 & ~zmm28 & zmm24) | (zmm13 & zmm28 & zmm24) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm28 & zmm0) | (zmm10 & ~zmm28 & ~zmm0) | (zmm10 & ~zmm28 & zmm0) | (zmm10 & zmm28 & zmm0) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm28 & zmm2) | (zmm1 & ~zmm28 & ~zmm2) | (zmm1 & ~zmm28 & zmm2) | (zmm1 & zmm28 & zmm2) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 64(%rcx) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rcx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll index af340d15fe8f6..9c2fb7704d1d4 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll @@ -6979,7 +6979,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm7[4],xmm3[5],xmm7[6],xmm3[7] ; AVX512-NEXT: vmovdqa64 %ymm28, 
%ymm7 ; AVX512-NEXT: vpshufb %ymm7, %ymm3, %ymm3 -; AVX512-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm0 & ~mem) | (ymm3 & ~ymm0 & ~mem) | (ymm3 & ~ymm0 & mem) | (ymm3 & ymm0 & ~mem) | (ymm3 & ymm0 & mem) ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm8[3],ymm13[4,5],ymm8[6],ymm13[7] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm7 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] @@ -7128,18 +7128,18 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512-NEXT: vpternlogq $184, %zmm21, %zmm4, %zmm11 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & zmm4 & zmm21) | (zmm11 & ~zmm4 & ~zmm21) | (zmm11 & ~zmm4 & zmm21) | (zmm11 & zmm4 & zmm21) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm22 -; AVX512-NEXT: vpternlogq $184, %zmm22, %zmm4, %zmm12 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = (~zmm22 & zmm5 & ~mem) | (zmm22 & ~zmm5 & mem) | (zmm22 & zmm5 & ~mem) | (zmm22 & zmm5 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = (~zmm12 & zmm4 & zmm22) | (zmm12 & ~zmm4 & ~zmm22) | (zmm12 & ~zmm4 & zmm22) | (zmm12 & zmm4 & zmm22) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $184, %zmm19, %zmm4, %zmm23 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm23 = (~zmm23 & zmm4 & zmm19) | (zmm23 & ~zmm4 & ~zmm19) | (zmm23 & ~zmm4 & zmm19) | (zmm23 & zmm4 & zmm19) ; AVX512-NEXT: 
movw $-512, %ax # imm = 0xFE00 ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa32 %zmm14, %zmm23 {%k1} -; AVX512-NEXT: vpternlogq $184, %zmm20, %zmm4, %zmm27 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm27 = (~zmm27 & zmm4 & zmm20) | (zmm27 & ~zmm4 & ~zmm20) | (zmm27 & ~zmm4 & zmm20) | (zmm27 & zmm4 & zmm20) ; AVX512-NEXT: vmovdqa32 %zmm25, %zmm27 {%k1} -; AVX512-NEXT: vpternlogq $226, %zmm18, %zmm4, %zmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm4 & zmm18) | (zmm1 & ~zmm4 & zmm18) | (zmm1 & zmm4 & ~zmm18) | (zmm1 & zmm4 & zmm18) ; AVX512-NEXT: vmovdqa32 %zmm10, %zmm1 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm11, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm12, (%rdx) @@ -7147,11 +7147,11 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm27, (%r8) ; AVX512-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vpternlogq $226, %zmm17, %zmm4, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & ~zmm4 & zmm17) | (zmm2 & ~zmm4 & zmm17) | (zmm2 & zmm4 & ~zmm17) | (zmm2 & zmm4 & zmm17) ; AVX512-NEXT: vmovdqa32 %zmm26, %zmm2 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vpternlogq $226, %zmm16, %zmm4, %zmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & ~zmm4 & zmm16) | (zmm3 & ~zmm4 & zmm16) | (zmm3 & zmm4 & ~zmm16) | (zmm3 & zmm4 & zmm16) ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512-NEXT: vzeroupper @@ -7344,7 +7344,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm0, 
%zmm30, %zmm16 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = (~zmm16 & zmm30 & zmm0) | (zmm16 & ~zmm30 & ~zmm0) | (zmm16 & ~zmm30 & zmm0) | (zmm16 & zmm30 & zmm0) ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm13[3],ymm3[4,5],ymm13[6],ymm3[7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3,4,5],xmm0[6],xmm4[7] @@ -7429,17 +7429,17 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm23, %zmm3, %zmm26 -; AVX512-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm24 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm24, %zmm3, %zmm27 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm21, %zmm30, %zmm15 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = (~zmm26 & zmm3 & zmm23) | (zmm26 & ~zmm3 & ~zmm23) | (zmm26 & ~zmm3 & zmm23) | (zmm26 & zmm3 & zmm23) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = (~zmm24 & zmm20 & ~mem) | (zmm24 & ~zmm20 & mem) | (zmm24 & zmm20 & ~mem) | (zmm24 & zmm20 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = (~zmm27 & zmm3 & zmm24) | (zmm27 & ~zmm3 & ~zmm24) | (zmm27 & ~zmm3 & zmm24) | (zmm27 & zmm3 & zmm24) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = (~zmm15 & zmm30 & zmm21) | (zmm15 & ~zmm30 & ~zmm21) | (zmm15 & ~zmm30 & zmm21) | (zmm15 & zmm30 & zmm21) ; AVX512-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm28, %zmm15 {%k1} -; AVX512-FCP-NEXT: vpternlogq $184, %zmm18, %zmm30, %zmm19 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = (~zmm19 & zmm30 & zmm18) | (zmm19 & ~zmm30 & ~zmm18) | (zmm19 & ~zmm30 & 
zmm18) | (zmm19 & zmm30 & zmm18) ; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm19 {%k1} ; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm16 {%k1} -; AVX512-FCP-NEXT: vpternlogq $226, %zmm25, %zmm30, %zmm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & ~zmm30 & zmm25) | (zmm11 & ~zmm30 & zmm25) | (zmm11 & zmm30 & ~zmm25) | (zmm11 & zmm30 & zmm25) ; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm11 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, (%rdx) @@ -7448,7 +7448,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%r9) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512-FCP-NEXT: vpternlogq $226, %zmm10, %zmm30, %zmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & ~zmm30 & zmm10) | (zmm0 & ~zmm30 & zmm10) | (zmm0 & zmm30 & ~zmm10) | (zmm0 & zmm30 & zmm10) ; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax) @@ -7639,7 +7639,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] ; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm3 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm0 & ~mem) | (ymm2 & ~ymm0 & ~mem) | (ymm2 & ~ymm0 & mem) | (ymm2 & ymm0 & ~mem) | (ymm2 & ymm0 & mem) ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] @@ -7649,7 +7649,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $184, %zmm19, %zmm25, %zmm22 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = (~zmm22 & zmm25 & zmm19) | (zmm22 & ~zmm25 & ~zmm19) | (zmm22 & ~zmm25 & zmm19) | (zmm22 & zmm25 & zmm19) ; AVX512DQ-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm22 {%k1} @@ -7682,7 +7682,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq $184, %zmm17, %zmm25, %zmm19 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm19 = (~zmm19 & zmm25 & zmm17) | (zmm19 & ~zmm25 & ~zmm17) | (zmm19 & ~zmm25 & zmm17) | (zmm19 & zmm25 & zmm17) ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm19 {%k1} ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm10[2],ymm4[3,4,5],ymm10[6],ymm4[7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] @@ -7714,7 +7714,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero ; AVX512DQ-NEXT: vpor %ymm3, %ymm12, %ymm3 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq $226, %zmm20, %zmm25, %zmm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & ~zmm25 & zmm20) | (zmm11 & ~zmm25 & zmm20) | (zmm11 & zmm25 & ~zmm20) | (zmm11 & zmm25 & zmm20) ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm11 {%k1} ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm5[1],ymm13[2,3],ymm5[4],ymm13[5,6,7] ; 
AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 @@ -7768,7 +7768,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm2[2,3,4,5,6,7] -; AVX512DQ-NEXT: vpternlogq $226, %zmm24, %zmm25, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & ~zmm25 & zmm24) | (zmm2 & ~zmm25 & zmm24) | (zmm2 & zmm25 & ~zmm24) | (zmm2 & zmm25 & zmm24) ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm17, %zmm0, %zmm2 {%k1} ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm3[1],ymm10[2,3],ymm3[4],ymm10[5,6,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] @@ -7799,13 +7799,13 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq $226, %zmm27, %zmm25, %zmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & ~zmm25 & zmm27) | (zmm3 & ~zmm25 & zmm27) | (zmm3 & zmm25 & ~zmm27) | (zmm3 & zmm25 & zmm27) ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-NEXT: vpternlogq $184, %zmm18, %zmm1, %zmm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & zmm1 & zmm18) | (zmm8 & ~zmm1 & ~zmm18) | (zmm8 & ~zmm1 & zmm18) | (zmm8 & zmm1 & zmm18) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%zmm0, %zmm21 -; AVX512DQ-NEXT: vpternlogq $184, %zmm21, %zmm1, %zmm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = (~zmm21 & zmm0 & ~mem) | (zmm21 & ~zmm0 & mem) | (zmm21 & zmm0 & ~mem) | (zmm21 & zmm0 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = (~zmm9 & zmm1 & zmm21) | (zmm9 & ~zmm1 & ~zmm21) | (zmm9 & ~zmm1 & zmm21) | (zmm9 & zmm1 & zmm21) ; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm22, (%rcx) @@ -7939,7 +7939,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm14, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm9, %zmm19, %zmm16 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = (~zmm16 & zmm19 & zmm9) | (zmm16 & ~zmm19 & ~zmm9) | (zmm16 & ~zmm19 & zmm9) | (zmm16 & zmm19 & zmm9) ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX512DQ-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 @@ -7977,7 +7977,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero ; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm12, %ymm12 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm2, %zmm19, %zmm20 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = (~zmm20 & zmm19 & zmm2) | (zmm20 & ~zmm19 & ~zmm2) | (zmm20 & ~zmm19 & zmm2) | (zmm20 & zmm19 & zmm2) ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm9, %zmm0, %zmm20 {%k1} ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = 
[8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] @@ -8002,7 +8002,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm15, %xmm11 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm10 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm12, %zmm19, %zmm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm19 & zmm12) | (zmm10 & ~zmm19 & ~zmm12) | (zmm10 & ~zmm19 & zmm12) | (zmm10 & zmm19 & zmm12) ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm11 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7] @@ -8065,7 +8065,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero ; AVX512DQ-FCP-NEXT: vpor %ymm13, %ymm9, %ymm9 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm12, %zmm19, %zmm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & ~zmm19 & zmm12) | (zmm11 & ~zmm19 & zmm12) | (zmm11 & zmm19 & ~zmm12) | (zmm11 & zmm19 & zmm12) ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm11 {%k1} ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,4,8,11,15,0,0,0] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm9 @@ -8094,12 +8094,12 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm2, %zmm19, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} 
zmm0 = (~zmm0 & ~zmm19 & zmm2) | (zmm0 & ~zmm19 & zmm2) | (zmm0 & zmm19 & ~zmm2) | (zmm0 & zmm19 & zmm2) ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm23, %zmm1, %zmm9 -; AVX512DQ-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm24 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm24, %zmm1, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = (~zmm9 & zmm1 & zmm23) | (zmm9 & ~zmm1 & ~zmm23) | (zmm9 & ~zmm1 & zmm23) | (zmm9 & zmm1 & zmm23) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = (~zmm24 & zmm21 & ~mem) | (zmm24 & ~zmm21 & mem) | (zmm24 & zmm21 & ~mem) | (zmm24 & zmm21 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm1 & zmm24) | (zmm5 & ~zmm1 & ~zmm24) | (zmm5 & ~zmm1 & zmm24) | (zmm5 & zmm1 & zmm24) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%rcx) @@ -14527,7 +14527,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm12, %ymm11, %ymm11 ; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm20 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $242, %ymm0, %ymm20, %ymm11 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ~ymm20 & ymm0) | (ymm11 & ~ymm20 & ~ymm0) | (ymm11 & ~ymm20 & ymm0) | (ymm11 & ymm20 & ~ymm0) | (ymm11 & ymm20 & ymm0) ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7] ; AVX512-NEXT: vmovdqa %ymm15, %ymm13 ; AVX512-NEXT: vextracti32x4 $1, %ymm0, %xmm25 @@ -14546,7 +14546,7 @@ define void 
@load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm11 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4],xmm1[5],xmm11[6],xmm1[7] ; AVX512-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX512-NEXT: vpternlogq $242, %ymm0, %ymm20, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ~ymm20 & ymm0) | (ymm1 & ~ymm20 & ~ymm0) | (ymm1 & ~ymm20 & ymm0) | (ymm1 & ymm20 & ~ymm0) | (ymm1 & ymm20 & ymm0) ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm11 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] @@ -14700,7 +14700,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vpternlogq $184, %zmm0, %zmm19, %zmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & zmm19 & zmm0) | (zmm3 & ~zmm19 & ~zmm0) | (zmm3 & ~zmm19 & zmm0) | (zmm3 & zmm19 & zmm0) ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] ; AVX512-NEXT: vmovdqa %ymm4, %ymm14 @@ -14812,7 +14812,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512-NEXT: vpternlogq $184, %zmm0, %zmm19, %zmm30 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm30 = (~zmm30 & zmm19 & zmm0) | (zmm30 & ~zmm19 & 
~zmm0) | (zmm30 & ~zmm19 & zmm0) | (zmm30 & zmm19 & zmm0) ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5,6,7] ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15] @@ -14868,7 +14868,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-NEXT: vpternlogq $184, %zmm6, %zmm19, %zmm13 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = (~zmm13 & zmm19 & zmm6) | (zmm13 & ~zmm19 & ~zmm6) | (zmm13 & ~zmm19 & zmm6) | (zmm13 & zmm19 & zmm6) ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] ; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm8 ; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2],xmm6[3],xmm8[4,5,6,7] @@ -14887,13 +14887,17 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm9 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm10 # 64-byte Folded Reload +; AVX512-NEXT: # zmm10 = (~zmm10 & ~zmm9 & mem) | (zmm10 & ~zmm9 & mem) | (zmm10 & zmm9 & ~mem) | (zmm10 & zmm9 & mem) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm12 # 64-byte Folded Reload +; AVX512-NEXT: # zmm12 = (~zmm12 & ~zmm9 & mem) | (zmm12 & ~zmm9 & mem) | (zmm12 & zmm9 & ~mem) | (zmm12 & zmm9 & mem) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload +; AVX512-NEXT: # zmm1 = (~zmm1 & zmm9 & mem) | (zmm1 & ~zmm9 & ~mem) | (zmm1 & ~zmm9 & mem) | (zmm1 & zmm9 & mem) ; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm3 # 64-byte Folded Reload -; AVX512-NEXT: vpternlogq $184, %zmm10, %zmm9, %zmm2 -; AVX512-NEXT: vpternlogq $184, %zmm12, %zmm9, %zmm4 +; AVX512-NEXT: # zmm3 = (~zmm3 & zmm9 & mem) | (zmm3 & ~zmm9 & ~mem) | (zmm3 & ~zmm9 & mem) | (zmm3 & zmm9 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm9 & zmm10) | (zmm2 & ~zmm9 & ~zmm10) | (zmm2 & ~zmm9 & zmm10) | (zmm2 & zmm9 & zmm10) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm9 & zmm12) | (zmm4 & ~zmm9 & ~zmm12) | (zmm4 & ~zmm9 & zmm12) | (zmm4 & zmm9 & zmm12) ; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,3,1] ; AVX512-NEXT: vpshufb %ymm5, %ymm8, %ymm5 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm7[1,2],ymm5[3,4,5,6,7] @@ -14912,21 +14916,26 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm12 # 64-byte Folded Reload +; AVX512-NEXT: # zmm12 = (~zmm12 & zmm19 & mem) | (zmm12 & ~zmm19 & ~mem) | (zmm12 & ~zmm19 & mem) | (zmm12 & zmm19 & mem) ; AVX512-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa32 %zmm6, %zmm12 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm6 # 64-byte Folded Reload +; AVX512-NEXT: # zmm6 = (~zmm6 & zmm19 & mem) | (zmm6 & ~zmm19 & ~mem) | (zmm6 & ~zmm19 & mem) | (zmm6 & zmm19 & mem) ; 
AVX512-NEXT: vmovdqa32 %zmm7, %zmm6 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm6 # 64-byte Folded Reload +; AVX512-NEXT: # zmm6 = (~zmm6 & zmm19 & mem) | (zmm6 & ~zmm19 & ~mem) | (zmm6 & ~zmm19 & mem) | (zmm6 & zmm19 & mem) ; AVX512-NEXT: vmovdqa32 %zmm8, %zmm6 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8 ; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm25 # 64-byte Folded Reload +; AVX512-NEXT: # zmm25 = (~zmm25 & zmm19 & mem) | (zmm25 & ~zmm19 & ~mem) | (zmm25 & ~zmm19 & mem) | (zmm25 & zmm19 & mem) ; AVX512-NEXT: vmovdqa32 %zmm9, %zmm25 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm6 # 64-byte Folded Reload +; AVX512-NEXT: # zmm6 = (~zmm6 & ~zmm19 & mem) | (zmm6 & ~zmm19 & mem) | (zmm6 & zmm19 & ~mem) | (zmm6 & zmm19 & mem) ; AVX512-NEXT: vmovdqa32 %zmm10, %zmm6 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm3, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rsi) @@ -14941,11 +14950,13 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa32 %zmm14, %zmm2 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm1 # 64-byte Folded Reload +; AVX512-NEXT: # zmm1 = (~zmm1 & ~zmm19 & mem) | (zmm1 & ~zmm19 & mem) | (zmm1 & zmm19 & ~mem) | (zmm1 & zmm19 & mem) ; AVX512-NEXT: vmovdqa32 %zmm15, %zmm1 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm2, 64(%r9) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm11 # 64-byte Folded Reload +; AVX512-NEXT: # zmm11 = (~zmm11 & ~zmm19 & mem) | (zmm11 & ~zmm19 & mem) | (zmm11 & zmm19 & ~mem) | (zmm11 & zmm19 & mem) ; AVX512-NEXT: vmovdqa64 %zmm30, %zmm1 
; AVX512-NEXT: vmovdqa32 %zmm23, %zmm1 {%k1} ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} @@ -15433,7 +15444,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm1, %zmm29, %zmm27 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = (~zmm27 & zmm29 & zmm1) | (zmm27 & ~zmm29 & ~zmm1) | (zmm27 & ~zmm29 & zmm1) | (zmm27 & zmm29 & zmm1) ; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm7[3],ymm15[4,5],ymm7[6],ymm15[7] ; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm12 @@ -15469,7 +15480,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermd %zmm25, %zmm5, %zmm3 ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm1, %zmm29, %zmm26 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = (~zmm26 & zmm29 & zmm1) | (zmm26 & ~zmm29 & ~zmm1) | (zmm26 & ~zmm29 & zmm1) | (zmm26 & zmm29 & zmm1) ; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm5 ; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm4 @@ -15619,19 +15630,24 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm9 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $226, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm12 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm12 = (~zmm12 & ~zmm9 & mem) | (zmm12 & ~zmm9 & mem) | (zmm12 & zmm9 & ~mem) | (zmm12 & zmm9 & mem) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm13 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm13 = (~zmm13 & ~zmm9 & mem) | (zmm13 & ~zmm9 & mem) | (zmm13 & zmm9 & ~mem) | (zmm13 & zmm9 & mem) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm17 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm17 = (~zmm17 & zmm9 & mem) | (zmm17 & ~zmm9 & ~mem) | (zmm17 & ~zmm9 & mem) | (zmm17 & zmm9 & mem) ; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm21 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq $184, %zmm12, %zmm9, %zmm5 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm13, %zmm9, %zmm11 +; AVX512-FCP-NEXT: # zmm21 = (~zmm21 & zmm9 & mem) | (zmm21 & ~zmm9 & ~mem) | (zmm21 & ~zmm9 & mem) | (zmm21 & zmm9 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm9 & zmm12) | (zmm5 & ~zmm9 & ~zmm12) | (zmm5 & ~zmm9 & zmm12) | (zmm5 & zmm9 & zmm12) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & zmm9 & zmm13) | (zmm11 & ~zmm9 & ~zmm13) | (zmm11 & ~zmm9 & zmm13) | (zmm11 & zmm9 & zmm13) ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm10[1,2],ymm8[3,4,5,6,7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3,4,5,6,7],ymm8[8,9,10],ymm3[11,12,13,14,15] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, 
%zmm19 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm19 = (~zmm19 & zmm29 & mem) | (zmm19 & ~zmm29 & ~mem) | (zmm19 & ~zmm29 & mem) | (zmm19 & zmm29 & mem) ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload @@ -15647,18 +15663,22 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm19 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm2 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm2 = (~zmm2 & zmm29 & mem) | (zmm2 & ~zmm29 & ~mem) | (zmm2 & ~zmm29 & mem) | (zmm2 & zmm29 & mem) ; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm2 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm8 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm8 = (~zmm8 & zmm29 & mem) | (zmm8 & ~zmm29 & ~mem) | (zmm8 & ~zmm29 & mem) | (zmm8 & zmm29 & mem) ; AVX512-FCP-NEXT: vmovdqa32 %zmm10, %zmm8 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm6 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm6 = (~zmm6 & zmm29 & mem) | (zmm6 & ~zmm29 & ~mem) | (zmm6 & ~zmm29 & mem) | (zmm6 & zmm29 & mem) ; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm6 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm13, %zmm9 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm1 # 
64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm1 = (~zmm1 & ~zmm29 & mem) | (zmm1 & ~zmm29 & mem) | (zmm1 & zmm29 & ~mem) | (zmm1 & zmm29 & mem) ; AVX512-FCP-NEXT: vmovdqa32 %zmm18, %zmm1 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 64(%rsi) @@ -15674,13 +15694,16 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm1 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm1 = (~zmm1 & ~zmm29 & mem) | (zmm1 & ~zmm29 & mem) | (zmm1 & zmm29 & ~mem) | (zmm1 & zmm29 & mem) ; AVX512-FCP-NEXT: vmovdqa32 %zmm16, %zmm1 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm4 = (~zmm4 & ~zmm29 & mem) | (zmm4 & ~zmm29 & mem) | (zmm4 & zmm29 & ~mem) | (zmm4 & zmm29 & mem) ; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm4 {%k1} ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax) ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm0 = (~zmm0 & ~zmm29 & mem) | (zmm0 & ~zmm29 & mem) | (zmm0 & zmm29 & ~mem) | (zmm0 & zmm29 & mem) ; AVX512-FCP-NEXT: vmovdqa32 %zmm7, %zmm0 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512-FCP-NEXT: addq $1800, %rsp # imm = 0x708 @@ -16152,7 +16175,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm12, %ymm10, %ymm10 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $242, 
%ymm0, %ymm19, %ymm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ~ymm19 & ymm0) | (ymm10 & ~ymm19 & ~ymm0) | (ymm10 & ~ymm19 & ymm0) | (ymm10 & ymm19 & ~ymm0) | (ymm10 & ymm19 & ymm0) ; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7] ; AVX512DQ-NEXT: vextracti32x4 $1, %ymm0, %xmm28 @@ -16165,6 +16188,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm10 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm10 = (~zmm10 & zmm17 & mem) | (zmm10 & ~zmm17 & ~mem) | (zmm10 & ~zmm17 & mem) | (zmm10 & zmm17 & mem) ; AVX512DQ-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1} @@ -16178,7 +16202,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm15 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm15[4],xmm11[5],xmm15[6],xmm11[7] ; AVX512DQ-NEXT: vpshufb %ymm12, %ymm11, %ymm11 -; AVX512DQ-NEXT: vpternlogq $242, %ymm0, %ymm19, %ymm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ~ymm19 & ymm0) | (ymm11 & ~ymm19 & ~ymm0) | (ymm11 & ~ymm19 & ymm0) | (ymm11 & ymm19 & ~ymm0) | (ymm11 & ymm19 & ymm0) ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm12 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] @@ -16189,6 +16213,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 
= ymm11[0,1,2,3,4,5],ymm0[6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm11 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm11 = (~zmm11 & zmm17 & mem) | (zmm11 & ~zmm17 & ~mem) | (zmm11 & ~zmm17 & mem) | (zmm11 & zmm17 & mem) ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm11 {%k1} ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm0 @@ -16227,6 +16252,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm27 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm27 = (~zmm27 & zmm17 & mem) | (zmm27 & ~zmm17 & ~mem) | (zmm27 & ~zmm17 & mem) | (zmm27 & zmm17 & mem) ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm27 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm0 ; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm2 @@ -16266,6 +16292,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm29 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm29 = (~zmm29 & zmm17 & mem) | (zmm29 & ~zmm17 & ~mem) | (zmm29 & ~zmm17 & mem) | (zmm29 & zmm17 & mem) ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm29 {%k1} ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -16314,6 +16341,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm14 ; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm7 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm7 = (~zmm7 & ~zmm17 & mem) | (zmm7 & ~zmm17 & mem) | (zmm7 & zmm17 & ~mem) | (zmm7 & zmm17 & mem) ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm7 {%k1} ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Reload @@ -16336,7 +16364,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3,4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogq $184, %zmm0, %zmm14, %zmm12 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = (~zmm12 & zmm14 & zmm0) | (zmm12 & ~zmm14 & ~zmm0) | (zmm12 & ~zmm14 & zmm0) | (zmm12 & zmm14 & zmm0) ; AVX512DQ-NEXT: vmovdqa %ymm6, %ymm9 ; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm7 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] @@ -16407,6 +16435,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm12 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm12 = (~zmm12 & ~zmm24 & mem) | (zmm12 & ~zmm24 & mem) | (zmm12 & zmm24 & ~mem) | (zmm12 & zmm24 & mem) ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm12 {%k1} ; 
AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm3[3],ymm14[4,5],ymm3[6],ymm14[7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] @@ -16445,7 +16474,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogq $184, %zmm1, %zmm24, %zmm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & zmm24 & zmm1) | (zmm7 & ~zmm24 & ~zmm1) | (zmm7 & ~zmm24 & zmm1) | (zmm7 & zmm24 & zmm1) ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm7 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm0 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6,7] @@ -16486,6 +16515,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm11 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm11 = (~zmm11 & ~zmm24 & mem) | (zmm11 & ~zmm24 & mem) | (zmm11 & zmm24 & ~mem) | (zmm11 & zmm24 & mem) ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm11 {%k1} ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] @@ -16501,7 +16531,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogq $184, %zmm1, 
%zmm24, %zmm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = (~zmm6 & zmm24 & zmm1) | (zmm6 & ~zmm24 & ~zmm1) | (zmm6 & ~zmm24 & zmm1) | (zmm6 & zmm24 & zmm1) ; AVX512DQ-NEXT: vmovdqa64 %ymm30, %ymm1 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0],ymm1[1],ymm14[2,3,4],ymm1[5],ymm14[6,7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm5 @@ -16527,13 +16557,17 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm0 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm1 = (~zmm1 & ~zmm0 & mem) | (zmm1 & ~zmm0 & mem) | (zmm1 & zmm0 & ~mem) | (zmm1 & zmm0 & mem) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm4 = (~zmm4 & ~zmm0 & mem) | (zmm4 & ~zmm0 & mem) | (zmm4 & zmm0 & ~mem) | (zmm4 & zmm0 & mem) ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm28 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm28 = (~zmm28 & zmm0 & mem) | (zmm28 & ~zmm0 & ~mem) | (zmm28 & ~zmm0 & mem) | (zmm28 & zmm0 & mem) ; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpternlogq $184, %zmm4, %zmm0, %zmm3 +; AVX512DQ-NEXT: # zmm23 = (~zmm23 & zmm0 & mem) | (zmm23 & ~zmm0 & ~mem) | (zmm23 & ~zmm0 & mem) | (zmm23 & zmm0 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm0 & zmm1) 
| (zmm2 & ~zmm0 & ~zmm1) | (zmm2 & ~zmm0 & zmm1) | (zmm2 & zmm0 & zmm1) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & zmm0 & zmm4) | (zmm3 & ~zmm0 & ~zmm4) | (zmm3 & ~zmm0 & zmm4) | (zmm3 & zmm0 & zmm4) ; AVX512DQ-NEXT: vmovdqa64 %zmm23, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm28, 64(%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rdx) @@ -16946,6 +16980,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm6 = (~zmm6 & zmm25 & mem) | (zmm6 & ~zmm25 & ~mem) | (zmm6 & ~zmm25 & mem) | (zmm6 & zmm25 & mem) ; AVX512DQ-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm6 {%k1} @@ -16963,6 +16998,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm1 = (~zmm1 & zmm25 & mem) | (zmm1 & ~zmm25 & ~mem) | (zmm1 & ~zmm25 & mem) | (zmm1 & zmm25 & mem) ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm7, %ymm0 @@ -17000,6 +17036,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm23 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm23 = (~zmm23 & zmm25 & mem) | (zmm23 & ~zmm25 & ~mem) | (zmm23 & ~zmm25 & mem) | (zmm23 & zmm25 & mem) ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm23 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 864(%rdi), %ymm1 @@ -17022,6 +17059,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm29 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm29 = (~zmm29 & zmm25 & mem) | (zmm29 & ~zmm25 & ~mem) | (zmm29 & ~zmm25 & mem) | (zmm29 & zmm25 & mem) ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm29 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -17040,7 +17078,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm28 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm28 = (~zmm28 & zmm25 & zmm0) | (zmm28 & ~zmm25 & ~zmm0) | (zmm28 & ~zmm25 & zmm0) | (zmm28 & zmm25 & zmm0) ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm8[3],ymm11[4,5],ymm8[6],ymm11[7] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm19 @@ -17077,7 +17115,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm29 
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm26 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = (~zmm26 & zmm25 & zmm0) | (zmm26 & ~zmm25 & ~zmm0) | (zmm26 & ~zmm25 & zmm0) | (zmm26 & zmm25 & zmm0) ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 @@ -17132,6 +17170,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm10, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm0 = (~zmm0 & ~zmm25 & mem) | (zmm0 & ~zmm25 & mem) | (zmm0 & zmm25 & ~mem) | (zmm0 & zmm25 & mem) ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm10 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm10[2,3],ymm14[4,5],ymm10[6,7] @@ -17160,6 +17199,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm9 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm9 = (~zmm9 & ~zmm25 & mem) | (zmm9 & ~zmm25 & mem) | (zmm9 & zmm25 & ~mem) | (zmm9 & zmm25 & mem) ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm9 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm2 @@ -17196,6 +17236,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm6[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm12 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm12 = (~zmm12 & ~zmm25 & mem) | (zmm12 & ~zmm25 & mem) | (zmm12 & zmm25 & ~mem) | (zmm12 & zmm25 & mem) ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm12 {%k1} ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6 @@ -17227,17 +17268,22 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm1 = (~zmm1 & ~zmm25 & mem) | (zmm1 & ~zmm25 & mem) | (zmm1 & zmm25 & ~mem) | (zmm1 & zmm25 & mem) ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k1} ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm2 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm3 = (~zmm3 & ~zmm2 & mem) | (zmm3 & ~zmm2 & mem) | (zmm3 & zmm2 & ~mem) | (zmm3 & zmm2 & mem) ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm4 = (~zmm4 & ~zmm2 & mem) | (zmm4 & ~zmm2 & mem) | (zmm4 & zmm2 & ~mem) | (zmm4 & zmm2 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm5 = (~zmm5 & zmm2 & mem) | (zmm5 & ~zmm2 & ~mem) | (zmm5 & ~zmm2 & mem) | (zmm5 & zmm2 & mem) ; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm3, %zmm2, %zmm10 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm4, %zmm2, %zmm8 +; AVX512DQ-FCP-NEXT: # zmm6 = (~zmm6 & zmm2 & mem) | (zmm6 & ~zmm2 & ~mem) | (zmm6 & ~zmm2 & mem) | (zmm6 & zmm2 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm2 & zmm3) | (zmm10 & ~zmm2 & ~zmm3) | (zmm10 & ~zmm2 & zmm3) | (zmm10 & zmm2 & zmm3) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & zmm2 & zmm4) | (zmm8 & ~zmm2 & ~zmm4) | (zmm8 & ~zmm2 & zmm4) | (zmm8 & zmm2 & zmm4) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 64(%rdx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll index e4dc257543d20..44684603e301d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll @@ -1979,7 +1979,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX512-NEXT: vmovdqa %ymm0, %ymm5 -; AVX512-NEXT: vpternlogq $202, %ymm4, %ymm3, %ymm5 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm3 & ymm4) | (~ymm5 & ymm3 & ymm4) | (ymm5 & ymm3 & ~ymm4) | (ymm5 & ymm3 & ymm4) ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX512-NEXT: vextracti128 $1, 
%ymm5, %xmm6 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u] @@ -1990,34 +1990,34 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX512-NEXT: vpor %xmm8, %xmm9, %xmm8 ; AVX512-NEXT: vpmovsxdq {{.*#+}} xmm9 = [18446744073709551615,16777215] -; AVX512-NEXT: vpternlogq $184, %xmm7, %xmm9, %xmm8 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm8 = (~xmm8 & xmm9 & xmm7) | (xmm8 & ~xmm9 & ~xmm7) | (xmm8 & ~xmm9 & xmm7) | (xmm8 & xmm9 & xmm7) ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11] ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero ; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512-NEXT: vpternlogq $184, %xmm5, %xmm9, %xmm6 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm6 = (~xmm6 & xmm9 & xmm5) | (xmm6 & ~xmm9 & ~xmm5) | (xmm6 & ~xmm9 & xmm5) | (xmm6 & xmm9 & xmm5) ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,6,12] ; AVX512-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-NEXT: vpternlogq $202, %ymm3, %ymm4, %ymm7 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = (~ymm7 & ~ymm4 & ymm3) | (~ymm7 & ymm4 & ymm3) | (ymm7 & ymm4 & ~ymm3) | (ymm7 & ymm4 & ymm3) ; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm10 ; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = 
xmm7[2,8,14],zero,zero,xmm7[0,6,12],zero,zero,zero,xmm7[u,u,u,u,u] ; AVX512-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX512-NEXT: vpternlogq $226, %xmm5, %xmm9, %xmm11 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm11 = (~xmm11 & ~xmm9 & xmm5) | (xmm11 & ~xmm9 & xmm5) | (xmm11 & xmm9 & ~xmm5) | (xmm11 & xmm9 & xmm5) ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13] ; AVX512-NEXT: vpor %xmm5, %xmm12, %xmm5 ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,9,15],zero,zero,xmm7[1,7,13],zero,zero,zero,xmm7[u,u,u,u,u] ; AVX512-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX512-NEXT: vpternlogq $226, %xmm5, %xmm9, %xmm7 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm7 = (~xmm7 & ~xmm9 & xmm5) | (xmm7 & ~xmm9 & xmm5) | (xmm7 & xmm9 & ~xmm5) | (xmm7 & xmm9 & xmm5) ; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] ; AVX512-NEXT: vpshufb %xmm5, %xmm2, %xmm9 -; AVX512-NEXT: vpternlogq $202, %ymm3, %ymm4, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm4 & ymm3) | (~ymm0 & ymm4 & ymm3) | (ymm0 & ymm4 & ~ymm3) | (ymm0 & ymm4 & ymm3) ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm4 ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm9[5,6,7] @@ -2051,7 +2051,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm5 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm4, %ymm3, %ymm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm3 & ymm4) | (~ymm5 & ymm3 & ymm4) | (ymm5 & ymm3 & ~ymm4) | (ymm5 & ymm3 & ymm4) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u] ; 
AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u] @@ -2062,34 +2062,34 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX512-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8 ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} xmm9 = [18446744073709551615,16777215] -; AVX512-FCP-NEXT: vpternlogq $184, %xmm7, %xmm9, %xmm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm8 = (~xmm8 & xmm9 & xmm7) | (xmm8 & ~xmm9 & ~xmm7) | (xmm8 & ~xmm9 & xmm7) | (xmm8 & xmm9 & xmm7) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero ; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512-FCP-NEXT: vpternlogq $184, %xmm5, %xmm9, %xmm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm6 = (~xmm6 & xmm9 & xmm5) | (xmm6 & ~xmm9 & ~xmm5) | (xmm6 & ~xmm9 & xmm5) | (xmm6 & xmm9 & xmm5) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,6,12] ; AVX512-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $202, %ymm3, %ymm4, %ymm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = (~ymm7 & ~ymm4 & ymm3) | (~ymm7 & ymm4 & ymm3) | (ymm7 & ymm4 & ~ymm3) | (ymm7 & ymm4 & ymm3) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm10 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = 
zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[2,8,14],zero,zero,xmm7[0,6,12],zero,zero,zero,xmm7[u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX512-FCP-NEXT: vpternlogq $226, %xmm5, %xmm9, %xmm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm11 = (~xmm11 & ~xmm9 & xmm5) | (xmm11 & ~xmm9 & xmm5) | (xmm11 & xmm9 & ~xmm5) | (xmm11 & xmm9 & xmm5) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13] ; AVX512-FCP-NEXT: vpor %xmm5, %xmm12, %xmm5 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,9,15],zero,zero,xmm7[1,7,13],zero,zero,zero,xmm7[u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX512-FCP-NEXT: vpternlogq $226, %xmm5, %xmm9, %xmm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm7 = (~xmm7 & ~xmm9 & xmm5) | (xmm7 & ~xmm9 & xmm5) | (xmm7 & xmm9 & ~xmm5) | (xmm7 & xmm9 & xmm5) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm9 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm3, %ymm4, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm4 & ymm3) | (~ymm0 & ymm4 & ymm3) | (ymm0 & ymm4 & ~ymm3) | (ymm0 & ymm4 & ymm3) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm4 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm9[5,6,7] @@ -2123,7 +2123,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm5 -; AVX512DQ-NEXT: vpternlogq $202, %ymm4, %ymm3, %ymm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm3 & ymm4) | (~ymm5 & ymm3 & ymm4) 
| (ymm5 & ymm3 & ~ymm4) | (ymm5 & ymm3 & ymm4) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u] @@ -2134,34 +2134,34 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX512DQ-NEXT: vpor %xmm8, %xmm9, %xmm8 ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} xmm9 = [18446744073709551615,16777215] -; AVX512DQ-NEXT: vpternlogq $184, %xmm7, %xmm9, %xmm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm8 = (~xmm8 & xmm9 & xmm7) | (xmm8 & ~xmm9 & ~xmm7) | (xmm8 & ~xmm9 & xmm7) | (xmm8 & xmm9 & xmm7) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero ; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512DQ-NEXT: vpternlogq $184, %xmm5, %xmm9, %xmm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm6 = (~xmm6 & xmm9 & xmm5) | (xmm6 & ~xmm9 & ~xmm5) | (xmm6 & ~xmm9 & xmm5) | (xmm6 & xmm9 & xmm5) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,6,12] ; AVX512DQ-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq $202, %ymm3, %ymm4, %ymm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = (~ymm7 & ~ymm4 & ymm3) | (~ymm7 & ymm4 & ymm3) | (ymm7 & ymm4 & ~ymm3) | 
(ymm7 & ymm4 & ymm3) ; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm10 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[2,8,14],zero,zero,xmm7[0,6,12],zero,zero,zero,xmm7[u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX512DQ-NEXT: vpternlogq $226, %xmm5, %xmm9, %xmm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm11 = (~xmm11 & ~xmm9 & xmm5) | (xmm11 & ~xmm9 & xmm5) | (xmm11 & xmm9 & ~xmm5) | (xmm11 & xmm9 & xmm5) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13] ; AVX512DQ-NEXT: vpor %xmm5, %xmm12, %xmm5 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,9,15],zero,zero,xmm7[1,7,13],zero,zero,zero,xmm7[u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX512DQ-NEXT: vpternlogq $226, %xmm5, %xmm9, %xmm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm7 = (~xmm7 & ~xmm9 & xmm5) | (xmm7 & ~xmm9 & xmm5) | (xmm7 & xmm9 & ~xmm5) | (xmm7 & xmm9 & xmm5) ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] ; AVX512DQ-NEXT: vpshufb %xmm5, %xmm2, %xmm9 -; AVX512DQ-NEXT: vpternlogq $202, %ymm3, %ymm4, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm4 & ymm3) | (~ymm0 & ymm4 & ymm3) | (ymm0 & ymm4 & ~ymm3) | (ymm0 & ymm4 & ymm3) ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512DQ-NEXT: vpshufb %xmm5, %xmm3, %xmm4 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm9[5,6,7] @@ -2195,7 +2195,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm4, %ymm3, %ymm5 +; 
AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm3 & ymm4) | (~ymm5 & ymm3 & ymm4) | (ymm5 & ymm3 & ~ymm4) | (ymm5 & ymm3 & ymm4) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u] @@ -2206,34 +2206,34 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8 ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} xmm9 = [18446744073709551615,16777215] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %xmm7, %xmm9, %xmm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm8 = (~xmm8 & xmm9 & xmm7) | (xmm8 & ~xmm9 & ~xmm7) | (xmm8 & ~xmm9 & xmm7) | (xmm8 & xmm9 & xmm7) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %xmm5, %xmm9, %xmm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm6 = (~xmm6 & xmm9 & xmm5) | (xmm6 & ~xmm9 & ~xmm5) | (xmm6 & ~xmm9 & xmm5) | (xmm6 & xmm9 & xmm5) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,6,12] ; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; 
AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm3, %ymm4, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = (~ymm7 & ~ymm4 & ymm3) | (~ymm7 & ymm4 & ymm3) | (ymm7 & ymm4 & ~ymm3) | (ymm7 & ymm4 & ymm3) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm10 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[2,8,14],zero,zero,xmm7[0,6,12],zero,zero,zero,xmm7[u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %xmm5, %xmm9, %xmm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm11 = (~xmm11 & ~xmm9 & xmm5) | (xmm11 & ~xmm9 & xmm5) | (xmm11 & xmm9 & ~xmm5) | (xmm11 & xmm9 & xmm5) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13] ; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm12, %xmm5 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,9,15],zero,zero,xmm7[1,7,13],zero,zero,zero,xmm7[u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %xmm5, %xmm9, %xmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm7 = (~xmm7 & ~xmm9 & xmm5) | (xmm7 & ~xmm9 & xmm5) | (xmm7 & xmm9 & ~xmm5) | (xmm7 & xmm9 & xmm5) ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm9 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm3, %ymm4, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm4 & ymm3) | (~ymm0 & ymm4 & ymm3) | (ymm0 & ymm4 & ~ymm3) | (ymm0 & ymm4 & ymm3) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm4 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm9[5,6,7] @@ -3775,7 +3775,7 @@ define void 
@load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512-NEXT: vmovdqa %ymm0, %ymm7 -; AVX512-NEXT: vpternlogq $202, %ymm3, %ymm17, %ymm7 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = (~ymm7 & ~ymm17 & ymm3) | (~ymm7 & ymm17 & ymm3) | (ymm7 & ymm17 & ~ymm3) | (ymm7 & ymm17 & ymm3) ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] ; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8 ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12,u,u,u,u,u] @@ -3784,13 +3784,13 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm1 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX512-NEXT: vmovdqa %ymm9, %ymm10 -; AVX512-NEXT: vpternlogq $202, %ymm5, %ymm1, %ymm10 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ~ymm1 & ymm5) | (~ymm10 & ymm1 & ymm5) | (ymm10 & ymm1 & ~ymm5) | (ymm10 & ymm1 & ymm5) ; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX512-NEXT: vpternlogq $248, %ymm16, %ymm4, %ymm11 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ymm4 & ymm16) | (ymm11 & ~ymm4 & ~ymm16) | (ymm11 & ~ymm4 & ymm16) | (ymm11 & ymm4 & ~ymm16) | (ymm11 & ymm4 & ymm16) ; AVX512-NEXT: vmovdqa 160(%rdi), %ymm13 ; AVX512-NEXT: vmovdqa %ymm0, %ymm14 -; AVX512-NEXT: vpternlogq $202, %ymm6, %ymm13, %ymm14 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = (~ymm14 & ~ymm13 & ymm6) | (~ymm14 & ymm13 & ymm6) | (ymm14 & ymm13 & ~ymm6) | (ymm14 & ymm13 & ymm6) ; AVX512-NEXT: 
vextracti128 $1, %ymm14, %xmm15 ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[0,6,12],zero,zero,zero,xmm15[4,10] ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,4,10],zero,zero,zero,xmm14[2,8,14],zero,zero @@ -3803,7 +3803,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpternlogq $248, %ymm16, %ymm7, %ymm8 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = (~ymm8 & ymm7 & ymm16) | (ymm8 & ~ymm7 & ~ymm16) | (ymm8 & ~ymm7 & ymm16) | (ymm8 & ymm7 & ~ymm16) | (ymm8 & ymm7 & ymm16) ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[1,7,13],zero,zero,zero,xmm15[5,11] ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u,u,5,11],zero,zero,zero,xmm14[3,9,15],zero,zero ; AVX512-NEXT: vpor %xmm7, %xmm10, %xmm7 @@ -3812,50 +3812,50 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512-NEXT: vmovdqa %ymm8, %ymm10 -; AVX512-NEXT: vpternlogq $202, %ymm17, %ymm3, %ymm10 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ~ymm3 & ymm17) | (~ymm10 & ymm3 & ymm17) | (ymm10 & ymm3 & ~ymm17) | (ymm10 & ymm3 & ymm17) ; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] ; AVX512-NEXT: vpor %xmm12, %xmm14, %xmm12 -; AVX512-NEXT: 
vpternlogq $202, %ymm1, %ymm5, %ymm9 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ~ymm5 & ymm1) | (~ymm9 & ymm5 & ymm1) | (ymm9 & ymm5 & ~ymm1) | (ymm9 & ymm5 & ymm1) ; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512-NEXT: vpternlogq $248, %ymm16, %ymm12, %ymm14 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = (~ymm14 & ymm12 & ymm16) | (ymm14 & ~ymm12 & ~ymm16) | (ymm14 & ~ymm12 & ymm16) | (ymm14 & ymm12 & ~ymm16) | (ymm14 & ymm12 & ymm16) ; AVX512-NEXT: vmovdqa %ymm0, %ymm12 -; AVX512-NEXT: vpternlogq $202, %ymm13, %ymm6, %ymm12 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = (~ymm12 & ~ymm6 & ymm13) | (~ymm12 & ymm6 & ymm13) | (ymm12 & ymm6 & ~ymm13) | (ymm12 & ymm6 & ymm13) ; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero ; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm2 ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[0,6,12] ; AVX512-NEXT: vpor %xmm4, %xmm15, %xmm4 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512-NEXT: vpmovsxwd {{.*#+}} ymm15 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] -; AVX512-NEXT: vpternlogq $184, %ymm14, %ymm15, %ymm4 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ymm15 & ymm14) | (ymm4 & ~ymm15 & ~ymm14) | (ymm4 & ~ymm15 & ymm14) | (ymm4 & ymm15 & ymm14) ; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] ; AVX512-NEXT: vpor %xmm11, %xmm10, %xmm10 ; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpternlogq $248, %ymm16, %ymm10, %ymm9 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ymm10 & ymm16) | (ymm9 & ~ymm10 & ~ymm16) | (ymm9 & ~ymm10 & ymm16) | (ymm9 & ymm10 & ~ymm16) | (ymm9 & ymm10 & ymm16) ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[1,7,13] ; AVX512-NEXT: vpor %xmm2, %xmm10, %xmm2 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-NEXT: vpternlogq $184, %ymm9, %ymm15, %ymm2 -; AVX512-NEXT: vpternlogq $202, %ymm6, %ymm13, %ymm8 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm15 & ymm9) | (ymm2 & ~ymm15 & ~ymm9) | (ymm2 & ~ymm15 & ymm9) | (ymm2 & ymm15 & ymm9) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = (~ymm8 & ~ymm13 & ymm6) | (~ymm8 & ymm13 & ymm6) | (ymm8 & ymm13 & ~ymm6) | (ymm8 & ymm13 & ymm6) ; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm6 ; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14] ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,2,8,14],zero,zero,xmm8[0,6,12],zero,zero,zero ; AVX512-NEXT: vpor %xmm9, %xmm10, %xmm9 ; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512-NEXT: vpternlogq $202, %ymm17, %ymm3, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm3 & ymm17) | (~ymm0 & ymm3 & ymm17) | (ymm0 & ymm3 & ~ymm17) | (ymm0 & ymm3 & ymm17) ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & 
ymm5 & mem) | (ymm1 & ~ymm5 & ~mem) | (ymm1 & ymm5 & ~mem) | (ymm1 & ymm5 & mem) ; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm5[5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] -; AVX512-NEXT: vpternlogq $226, %ymm9, %ymm15, %ymm5 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm15 & ymm9) | (ymm5 & ~ymm15 & ymm9) | (ymm5 & ymm15 & ~ymm9) | (ymm5 & ymm15 & ymm9) ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15] ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,3,9,15],zero,zero,xmm8[1,7,13],zero,zero,zero ; AVX512-NEXT: vpor %xmm6, %xmm8, %xmm6 @@ -3866,7 +3866,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vpternlogq $226, %ymm6, %ymm15, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm15 & ymm6) | (ymm0 & ~ymm15 & ymm6) | (ymm0 & ymm15 & ~ymm6) | (ymm0 & ymm15 & ymm6) ; AVX512-NEXT: vmovdqa64 %ymm18, (%rsi) ; AVX512-NEXT: vmovdqa %ymm7, (%rdx) ; AVX512-NEXT: vmovdqa %ymm4, (%rcx) @@ -3885,7 +3885,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm7 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm3, %ymm17, %ymm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = (~ymm7 & ~ymm17 & ymm3) | (~ymm7 & ymm17 & ymm3) | (ymm7 & ymm17 & ~ymm3) | (ymm7 & ymm17 & ymm3) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] ; 
AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12,u,u,u,u,u] @@ -3894,13 +3894,13 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm10 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm5, %ymm1, %ymm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ~ymm1 & ymm5) | (~ymm10 & ymm1 & ymm5) | (ymm10 & ymm1 & ~ymm5) | (ymm10 & ymm1 & ymm5) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX512-FCP-NEXT: vpternlogq $248, %ymm16, %ymm4, %ymm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ymm4 & ymm16) | (ymm11 & ~ymm4 & ~ymm16) | (ymm11 & ~ymm4 & ymm16) | (ymm11 & ymm4 & ~ymm16) | (ymm11 & ymm4 & ymm16) ; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm13 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm14 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm6, %ymm13, %ymm14 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = (~ymm14 & ~ymm13 & ymm6) | (~ymm14 & ymm13 & ymm6) | (ymm14 & ymm13 & ~ymm6) | (ymm14 & ymm13 & ymm6) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[0,6,12],zero,zero,zero,xmm15[4,10] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,4,10],zero,zero,zero,xmm14[2,8,14],zero,zero @@ -3913,7 +3913,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = 
zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq $248, %ymm16, %ymm7, %ymm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = (~ymm8 & ymm7 & ymm16) | (ymm8 & ~ymm7 & ~ymm16) | (ymm8 & ~ymm7 & ymm16) | (ymm8 & ymm7 & ~ymm16) | (ymm8 & ymm7 & ymm16) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[1,7,13],zero,zero,zero,xmm15[5,11] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u,u,5,11],zero,zero,zero,xmm14[3,9,15],zero,zero ; AVX512-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 @@ -3922,50 +3922,50 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm10 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm17, %ymm3, %ymm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ~ymm3 & ymm17) | (~ymm10 & ymm3 & ymm17) | (ymm10 & ymm3 & ~ymm17) | (ymm10 & ymm3 & ymm17) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm12, %xmm14, %xmm12 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm1, %ymm5, %ymm9 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ~ymm5 & ymm1) | (~ymm9 & ymm5 & ymm1) | (ymm9 & ymm5 & ~ymm1) | (ymm9 & ymm5 & ymm1) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512-FCP-NEXT: vpternlogq $248, %ymm16, %ymm12, %ymm14 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = (~ymm14 & ymm12 & ymm16) | (ymm14 & ~ymm12 & ~ymm16) | (ymm14 & ~ymm12 & ymm16) | (ymm14 & ymm12 & ~ymm16) | (ymm14 & ymm12 & ymm16) ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm12 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm13, %ymm6, %ymm12 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = (~ymm12 & ~ymm6 & ymm13) | (~ymm12 & ymm6 & ymm13) | (ymm12 & ymm6 & ~ymm13) | (ymm12 & ymm6 & ymm13) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero ; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm2 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[0,6,12] ; AVX512-FCP-NEXT: vpor %xmm4, %xmm15, %xmm4 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} ymm15 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] -; AVX512-FCP-NEXT: vpternlogq $184, %ymm14, %ymm15, %ymm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ymm15 & ymm14) | (ymm4 & ~ymm15 & ~ymm14) | (ymm4 & ~ymm15 & ymm14) | (ymm4 & ymm15 & ymm14) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpternlogq $248, %ymm16, %ymm10, %ymm9 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ymm10 & ymm16) 
| (ymm9 & ~ymm10 & ~ymm16) | (ymm9 & ~ymm10 & ymm16) | (ymm9 & ymm10 & ~ymm16) | (ymm9 & ymm10 & ymm16) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[1,7,13] ; AVX512-FCP-NEXT: vpor %xmm2, %xmm10, %xmm2 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpternlogq $184, %ymm9, %ymm15, %ymm2 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm6, %ymm13, %ymm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm15 & ymm9) | (ymm2 & ~ymm15 & ~ymm9) | (ymm2 & ~ymm15 & ymm9) | (ymm2 & ymm15 & ymm9) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = (~ymm8 & ~ymm13 & ymm6) | (~ymm8 & ymm13 & ymm6) | (ymm8 & ymm13 & ~ymm6) | (ymm8 & ymm13 & ymm6) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm6 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,2,8,14],zero,zero,xmm8[0,6,12],zero,zero,zero ; AVX512-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm17, %ymm3, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm3 & ymm17) | (~ymm0 & ymm3 & ymm17) | (ymm0 & ymm3 & ~ymm17) | (ymm0 & ymm3 & ymm17) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm5 & mem) | (ymm1 & ~ymm5 & ~mem) | (ymm1 & ymm5 & ~mem) | (ymm1 & ymm5 & mem) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = 
ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm5[5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq $226, %ymm9, %ymm15, %ymm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm15 & ymm9) | (ymm5 & ~ymm15 & ymm9) | (ymm5 & ymm15 & ~ymm9) | (ymm5 & ymm15 & ymm9) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,3,9,15],zero,zero,xmm8[1,7,13],zero,zero,zero ; AVX512-FCP-NEXT: vpor %xmm6, %xmm8, %xmm6 @@ -3976,7 +3976,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq $226, %ymm6, %ymm15, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm15 & ymm6) | (ymm0 & ~ymm15 & ymm6) | (ymm0 & ymm15 & ~ymm6) | (ymm0 & ymm15 & ymm6) ; AVX512-FCP-NEXT: vmovdqa64 %ymm18, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %ymm7, (%rdx) ; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rcx) @@ -3995,7 +3995,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm7 -; AVX512DQ-NEXT: vpternlogq $202, %ymm3, %ymm17, %ymm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = (~ymm7 & ~ymm17 & ymm3) | (~ymm7 & ymm17 & ymm3) | (ymm7 & ymm17 & ~ymm3) | (ymm7 & ymm17 & ymm3) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] ; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8 ; AVX512DQ-NEXT: vpshufb {{.*#+}} 
xmm5 = zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12,u,u,u,u,u] @@ -4004,13 +4004,13 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm10 -; AVX512DQ-NEXT: vpternlogq $202, %ymm5, %ymm1, %ymm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ~ymm1 & ymm5) | (~ymm10 & ymm1 & ymm5) | (ymm10 & ymm1 & ~ymm5) | (ymm10 & ymm1 & ymm5) ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpternlogq $248, %ymm16, %ymm4, %ymm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ymm4 & ymm16) | (ymm11 & ~ymm4 & ~ymm16) | (ymm11 & ~ymm4 & ymm16) | (ymm11 & ymm4 & ~ymm16) | (ymm11 & ymm4 & ymm16) ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm13 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm14 -; AVX512DQ-NEXT: vpternlogq $202, %ymm6, %ymm13, %ymm14 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = (~ymm14 & ~ymm13 & ymm6) | (~ymm14 & ymm13 & ymm6) | (ymm14 & ymm13 & ~ymm6) | (ymm14 & ymm13 & ymm6) ; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[0,6,12],zero,zero,zero,xmm15[4,10] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,4,10],zero,zero,zero,xmm14[2,8,14],zero,zero @@ -4023,7 +4023,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpternlogq $248, %ymm16, %ymm7, %ymm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = (~ymm8 & ymm7 & ymm16) | (ymm8 & ~ymm7 & ~ymm16) | (ymm8 & ~ymm7 & ymm16) | (ymm8 & ymm7 & ~ymm16) | (ymm8 & ymm7 & ymm16) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[1,7,13],zero,zero,zero,xmm15[5,11] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u,u,5,11],zero,zero,zero,xmm14[3,9,15],zero,zero ; AVX512DQ-NEXT: vpor %xmm7, %xmm10, %xmm7 @@ -4032,50 +4032,50 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm10 -; AVX512DQ-NEXT: vpternlogq $202, %ymm17, %ymm3, %ymm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ~ymm3 & ymm17) | (~ymm10 & ymm3 & ymm17) | (ymm10 & ymm3 & ~ymm17) | (ymm10 & ymm3 & ymm17) ; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm12, %xmm14, %xmm12 -; AVX512DQ-NEXT: vpternlogq $202, %ymm1, %ymm5, %ymm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ~ymm5 & ymm1) | (~ymm9 & ymm5 & ymm1) | (ymm9 & ymm5 & ~ymm1) | (ymm9 & ymm5 & ymm1) ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: 
vpternlogq $248, %ymm16, %ymm12, %ymm14 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = (~ymm14 & ymm12 & ymm16) | (ymm14 & ~ymm12 & ~ymm16) | (ymm14 & ~ymm12 & ymm16) | (ymm14 & ymm12 & ~ymm16) | (ymm14 & ymm12 & ymm16) ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm12 -; AVX512DQ-NEXT: vpternlogq $202, %ymm13, %ymm6, %ymm12 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = (~ymm12 & ~ymm6 & ymm13) | (~ymm12 & ymm6 & ymm13) | (ymm12 & ymm6 & ~ymm13) | (ymm12 & ymm6 & ymm13) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero ; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[0,6,12] ; AVX512DQ-NEXT: vpor %xmm4, %xmm15, %xmm4 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} ymm15 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] -; AVX512DQ-NEXT: vpternlogq $184, %ymm14, %ymm15, %ymm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ymm15 & ymm14) | (ymm4 & ~ymm15 & ~ymm14) | (ymm4 & ~ymm15 & ymm14) | (ymm4 & ymm15 & ymm14) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm11, %xmm10, %xmm10 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpternlogq $248, %ymm16, %ymm10, %ymm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ymm10 & ymm16) | (ymm9 & ~ymm10 & ~ymm16) | (ymm9 & ~ymm10 & ymm16) | (ymm9 & ymm10 & ~ymm16) | (ymm9 & ymm10 & ymm16) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[1,7,13] ; 
AVX512DQ-NEXT: vpor %xmm2, %xmm10, %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpternlogq $184, %ymm9, %ymm15, %ymm2 -; AVX512DQ-NEXT: vpternlogq $202, %ymm6, %ymm13, %ymm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm15 & ymm9) | (ymm2 & ~ymm15 & ~ymm9) | (ymm2 & ~ymm15 & ymm9) | (ymm2 & ymm15 & ymm9) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = (~ymm8 & ~ymm13 & ymm6) | (~ymm8 & ymm13 & ymm6) | (ymm8 & ymm13 & ~ymm6) | (ymm8 & ymm13 & ymm6) ; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm6 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,2,8,14],zero,zero,xmm8[0,6,12],zero,zero,zero ; AVX512DQ-NEXT: vpor %xmm9, %xmm10, %xmm9 ; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-NEXT: vpternlogq $202, %ymm17, %ymm3, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm3 & ymm17) | (~ymm0 & ymm3 & ymm17) | (ymm0 & ymm3 & ~ymm17) | (ymm0 & ymm3 & ymm17) ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm5 & mem) | (ymm1 & ~ymm5 & ~mem) | (ymm1 & ymm5 & ~mem) | (ymm1 & ymm5 & mem) ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm5[5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq $226, %ymm9, %ymm15, %ymm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm15 & ymm9) | (ymm5 & ~ymm15 & ymm9) | (ymm5 & ymm15 & 
~ymm9) | (ymm5 & ymm15 & ymm9) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,3,9,15],zero,zero,xmm8[1,7,13],zero,zero,zero ; AVX512DQ-NEXT: vpor %xmm6, %xmm8, %xmm6 @@ -4086,7 +4086,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq $226, %ymm6, %ymm15, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm15 & ymm6) | (ymm0 & ~ymm15 & ymm6) | (ymm0 & ymm15 & ~ymm6) | (ymm0 & ymm15 & ymm6) ; AVX512DQ-NEXT: vmovdqa64 %ymm18, (%rsi) ; AVX512DQ-NEXT: vmovdqa %ymm7, (%rdx) ; AVX512DQ-NEXT: vmovdqa %ymm4, (%rcx) @@ -4105,7 +4105,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm7 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm3, %ymm17, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = (~ymm7 & ~ymm17 & ymm3) | (~ymm7 & ymm17 & ymm3) | (ymm7 & ymm17 & ~ymm3) | (ymm7 & ymm17 & ymm3) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12,u,u,u,u,u] @@ -4114,13 +4114,13 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX512DQ-FCP-NEXT: vmovdqa 
%ymm9, %ymm10 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm5, %ymm1, %ymm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ~ymm1 & ymm5) | (~ymm10 & ymm1 & ymm5) | (ymm10 & ymm1 & ~ymm5) | (ymm10 & ymm1 & ymm5) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm16, %ymm4, %ymm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ymm4 & ymm16) | (ymm11 & ~ymm4 & ~ymm16) | (ymm11 & ~ymm4 & ymm16) | (ymm11 & ymm4 & ~ymm16) | (ymm11 & ymm4 & ymm16) ; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm13 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm14 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm6, %ymm13, %ymm14 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = (~ymm14 & ~ymm13 & ymm6) | (~ymm14 & ymm13 & ymm6) | (ymm14 & ymm13 & ~ymm6) | (ymm14 & ymm13 & ymm6) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[0,6,12],zero,zero,zero,xmm15[4,10] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,4,10],zero,zero,zero,xmm14[2,8,14],zero,zero @@ -4133,7 +4133,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm16, %ymm7, %ymm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = (~ymm8 & ymm7 & ymm16) | (ymm8 & ~ymm7 & ~ymm16) | (ymm8 & 
~ymm7 & ymm16) | (ymm8 & ymm7 & ~ymm16) | (ymm8 & ymm7 & ymm16) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[1,7,13],zero,zero,zero,xmm15[5,11] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u,u,5,11],zero,zero,zero,xmm14[3,9,15],zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 @@ -4142,50 +4142,50 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm10 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm17, %ymm3, %ymm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ~ymm3 & ymm17) | (~ymm10 & ymm3 & ymm17) | (ymm10 & ymm3 & ~ymm17) | (ymm10 & ymm3 & ymm17) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm12, %xmm14, %xmm12 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm1, %ymm5, %ymm9 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ~ymm5 & ymm1) | (~ymm9 & ymm5 & ymm1) | (ymm9 & ymm5 & ~ymm1) | (ymm9 & ymm5 & ymm1) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm16, %ymm12, %ymm14 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = (~ymm14 & ymm12 & ymm16) | (ymm14 & ~ymm12 & ~ymm16) | (ymm14 & ~ymm12 & ymm16) | (ymm14 & ymm12 & ~ymm16) | (ymm14 & ymm12 & ymm16) ; 
AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm12 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm13, %ymm6, %ymm12 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = (~ymm12 & ~ymm6 & ymm13) | (~ymm12 & ymm6 & ymm13) | (ymm12 & ymm6 & ~ymm13) | (ymm12 & ymm6 & ymm13) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm2 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[0,6,12] ; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm15, %xmm4 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} ymm15 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm14, %ymm15, %ymm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ymm15 & ymm14) | (ymm4 & ~ymm15 & ~ymm14) | (ymm4 & ~ymm15 & ymm14) | (ymm4 & ymm15 & ymm14) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm16, %ymm10, %ymm9 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ymm10 & ymm16) | (ymm9 & ~ymm10 & ~ymm16) | (ymm9 & ~ymm10 & ymm16) | (ymm9 & ymm10 & ~ymm16) | (ymm9 & ymm10 & ymm16) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[1,7,13] ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm10, %xmm2 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq $184, 
%ymm9, %ymm15, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm6, %ymm13, %ymm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm15 & ymm9) | (ymm2 & ~ymm15 & ~ymm9) | (ymm2 & ~ymm15 & ymm9) | (ymm2 & ymm15 & ymm9) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = (~ymm8 & ~ymm13 & ymm6) | (~ymm8 & ymm13 & ymm6) | (ymm8 & ymm13 & ~ymm6) | (ymm8 & ymm13 & ymm6) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm6 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,2,8,14],zero,zero,xmm8[0,6,12],zero,zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm17, %ymm3, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm3 & ymm17) | (~ymm0 & ymm3 & ymm17) | (ymm0 & ymm3 & ~ymm17) | (ymm0 & ymm3 & ymm17) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm5 & mem) | (ymm1 & ~ymm5 & ~mem) | (ymm1 & ymm5 & ~mem) | (ymm1 & ymm5 & mem) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm5[5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm9, %ymm15, %ymm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm15 & ymm9) | (ymm5 & ~ymm15 & ymm9) | (ymm5 & ymm15 & ~ymm9) | (ymm5 & ymm15 & ymm9) ; 
AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,3,9,15],zero,zero,xmm8[1,7,13],zero,zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm8, %xmm6 @@ -4196,7 +4196,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm6, %ymm15, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm15 & ymm6) | (ymm0 & ~ymm15 & ymm6) | (ymm0 & ymm15 & ~ymm6) | (ymm0 & ymm15 & ymm6) ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rcx) @@ -7385,7 +7385,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 224(%rdi), %ymm25 ; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm26 ; AVX512-NEXT: vmovdqa %ymm12, %ymm0 -; AVX512-NEXT: vpternlogq $202, %ymm25, %ymm26, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm26 & ymm25) | (~ymm0 & ymm26 & ymm25) | (ymm0 & ymm26 & ~ymm25) | (ymm0 & ymm26 & ymm25) ; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm3 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm4 @@ -7396,7 +7396,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm24 ; AVX512-NEXT: vmovdqa64 160(%rdi), %ymm18 ; AVX512-NEXT: vmovdqa %ymm12, %ymm6 -; AVX512-NEXT: vpternlogq $202, %ymm24, %ymm18, %ymm6 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ~ymm18 & ymm24) | (~ymm6 & ymm18 & ymm24) | (ymm6 & ymm18 & ~ymm24) | (ymm6 & ymm18 & 
ymm24) ; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] ; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm10 @@ -7407,7 +7407,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vinserti32x4 $2, %xmm9, %zmm10, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa %ymm12, %ymm9 -; AVX512-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm9 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ~ymm30 & ymm31) | (~ymm9 & ymm30 & ymm31) | (ymm9 & ymm30 & ~ymm31) | (ymm9 & ymm30 & ymm31) ; AVX512-NEXT: vpshufb %xmm1, %xmm9, %xmm1 ; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm13 ; AVX512-NEXT: vpshufb %xmm5, %xmm13, %xmm5 @@ -7415,7 +7415,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 320(%rdi), %ymm29 ; AVX512-NEXT: vmovdqa64 352(%rdi), %ymm22 ; AVX512-NEXT: vmovdqa %ymm12, %ymm1 -; AVX512-NEXT: vpternlogq $202, %ymm29, %ymm22, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ~ymm22 & ymm29) | (~ymm1 & ymm22 & ymm29) | (ymm1 & ymm22 & ~ymm29) | (ymm1 & ymm22 & ymm29) ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm5 ; AVX512-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX512-NEXT: vpshufb %xmm8, %xmm1, %xmm8 @@ -7441,7 +7441,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512-NEXT: vmovdqa %ymm9, %ymm4 -; AVX512-NEXT: vpternlogq $202, %ymm26, %ymm25, %ymm4 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ~ymm25 & ymm26) | (~ymm4 & ymm25 & ymm26) | (ymm4 & ymm25 & ~ymm26) | (ymm4 & ymm25 & ymm26) ; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm15 ; AVX512-NEXT: vpshufb %xmm0, %xmm15, %xmm1 ; AVX512-NEXT: vmovdqa 
{{.*#+}} xmm6 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] @@ -7449,7 +7449,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpor %xmm1, %xmm5, %xmm1 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vmovdqa %ymm12, %ymm5 -; AVX512-NEXT: vpternlogq $202, %ymm18, %ymm24, %ymm5 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm24 & ymm18) | (~ymm5 & ymm24 & ymm18) | (ymm5 & ymm24 & ~ymm18) | (ymm5 & ymm24 & ymm18) ; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] ; AVX512-NEXT: vpshufb %xmm8, %xmm5, %xmm7 ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm1 @@ -7458,13 +7458,13 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpor %xmm7, %xmm13, %xmm2 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vmovdqa %ymm9, %ymm13 -; AVX512-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm13 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = (~ymm13 & ~ymm31 & ymm30) | (~ymm13 & ymm31 & ymm30) | (ymm13 & ymm31 & ~ymm30) | (ymm13 & ymm31 & ymm30) ; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX512-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX512-NEXT: vpshufb %xmm6, %xmm13, %xmm6 ; AVX512-NEXT: vporq %xmm0, %xmm6, %xmm16 ; AVX512-NEXT: vmovdqa %ymm12, %ymm11 -; AVX512-NEXT: vpternlogq $202, %ymm22, %ymm29, %ymm11 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ~ymm29 & ymm22) | (~ymm11 & ymm29 & ymm22) | (ymm11 & ymm29 & ~ymm22) | (ymm11 & ymm29 & ymm22) ; AVX512-NEXT: vpshufb %xmm8, %xmm11, %xmm8 ; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm7 ; AVX512-NEXT: vpshufb %xmm10, %xmm7, %xmm10 @@ -7485,7 +7485,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vinserti32x4 $1, 288(%rdi), %ymm1, %ymm20 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX512-NEXT: 
vmovdqa %ymm5, %ymm1 -; AVX512-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ~ymm20 & ymm19) | (~ymm1 & ymm20 & ymm19) | (ymm1 & ymm20 & ~ymm19) | (ymm1 & ymm20 & ymm19) ; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3,4,5,6,7],ymm15[8,9,10],ymm3[11,12,13,14,15] ; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5,6,7] @@ -7494,25 +7494,26 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vinserti32x4 $1, 96(%rdi), %ymm23, %ymm23 ; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm15 ; AVX512-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512-NEXT: vpternlogq $202, %ymm6, %ymm23, %ymm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ~ymm23 & ymm6) | (~ymm2 & ymm23 & ymm6) | (ymm2 & ymm23 & ~ymm6) | (ymm2 & ymm23 & ymm6) ; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpternlogq $248, %ymm4, %ymm17, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ymm17 & ymm4) | (ymm0 & ~ymm17 & ~ymm4) | (ymm0 & ~ymm17 & ymm4) | (ymm0 & ymm17 & ~ymm4) | (ymm0 & ymm17 & ymm4) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload +; AVX512-NEXT: # zmm0 = (~zmm0 & ~zmm17 & mem) | (zmm0 & ~zmm17 & mem) | (zmm0 & zmm17 & ~mem) | (zmm0 & zmm17 & mem) ; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm3 = 
[18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] -; AVX512-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm15 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm15 = (~zmm15 & zmm3 & zmm0) | (zmm15 & ~zmm3 & ~zmm0) | (zmm15 & ~zmm3 & zmm0) | (zmm15 & zmm3 & zmm0) ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm1 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpternlogq $248, %ymm4, %ymm21, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm21 & ymm4) | (ymm1 & ~ymm21 & ~ymm4) | (ymm1 & ~ymm21 & ymm4) | (ymm1 & ymm21 & ~ymm4) | (ymm1 & ymm21 & ymm4) ; AVX512-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm2 ; AVX512-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512-NEXT: vpternlogq $226, %zmm2, %zmm17, %zmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm17 & zmm2) | (zmm1 & ~zmm17 & zmm2) | (zmm1 & zmm17 & ~zmm2) | (zmm1 & zmm17 & zmm2) ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm17 -; AVX512-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm17 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = (~zmm17 & zmm3 & zmm1) | (zmm17 & ~zmm3 & ~zmm1) | (zmm17 & ~zmm3 & zmm1) | (zmm17 & zmm3 & zmm1) ; AVX512-NEXT: vpshufb %xmm10, %xmm14, %xmm0 ; AVX512-NEXT: vpshufb %xmm8, %xmm13, %xmm1 ; AVX512-NEXT: vporq %xmm0, %xmm1, %xmm21 @@ -7520,7 +7521,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u],zero,zero,zero,xmm7[3,9,15],zero,zero,xmm7[1,7,13] ; 
AVX512-NEXT: vporq %xmm0, %xmm1, %xmm28 ; AVX512-NEXT: vmovdqa64 %ymm25, %ymm11 -; AVX512-NEXT: vpternlogq $226, %ymm26, %ymm12, %ymm11 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ~ymm12 & ymm26) | (ymm11 & ~ymm12 & ymm26) | (ymm11 & ymm12 & ~ymm26) | (ymm11 & ymm12 & ymm26) ; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm0 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u] ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] @@ -7528,17 +7529,17 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 %xmm3, %xmm25 ; AVX512-NEXT: vporq %xmm1, %xmm2, %xmm26 ; AVX512-NEXT: vmovdqa64 %ymm18, %ymm14 -; AVX512-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm14 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = (~ymm14 & ~ymm9 & ymm24) | (ymm14 & ~ymm9 & ymm24) | (ymm14 & ymm9 & ~ymm24) | (ymm14 & ymm9 & ymm24) ; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm10 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] ; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm2 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] ; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm4 ; AVX512-NEXT: vporq %xmm2, %xmm4, %xmm27 -; AVX512-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm12 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = (~ymm12 & ~ymm31 & ymm30) | (~ymm12 & ymm31 & ymm30) | (ymm12 & ymm31 & ~ymm30) | (ymm12 & ymm31 & ymm30) ; AVX512-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512-NEXT: vpternlogq $202, %ymm23, %ymm6, %ymm4 -; AVX512-NEXT: vpternlogq $202, %ymm29, %ymm22, %ymm9 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ~ymm6 & ymm23) | (~ymm4 & ymm6 & ymm23) | (ymm4 & ymm6 & ~ymm23) | (ymm4 & ymm6 & ymm23) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ~ymm22 & ymm29) | (~ymm9 & ymm22 & ymm29) | (ymm9 & ymm22 & ~ymm29) | (ymm9 & ymm22 & ymm29) ; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm8 ; AVX512-NEXT: vpshufb 
%xmm1, %xmm8, %xmm1 ; AVX512-NEXT: vpshufb %xmm3, %xmm9, %xmm2 @@ -7557,14 +7558,14 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,4,10,0,6,12,18,24,30,20,26,128,128,128,128,128,128,128,128,128,128,128] ; AVX512-NEXT: vpshufb %ymm1, %ymm4, %ymm11 ; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512-NEXT: vpternlogq $236, %ymm18, %ymm11, %ymm16 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm16 = (~ymm16 & ymm11 & ~ymm18) | (~ymm16 & ymm11 & ymm18) | (ymm16 & ~ymm11 & ymm18) | (ymm16 & ymm11 & ~ymm18) | (ymm16 & ymm11 & ymm18) ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128] ; AVX512-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512-NEXT: vpternlogq $236, %ymm18, %ymm4, %ymm21 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm21 = (~ymm21 & ymm4 & ~ymm18) | (~ymm21 & ymm4 & ymm18) | (ymm21 & ~ymm4 & ymm18) | (ymm21 & ymm4 & ~ymm18) | (ymm21 & ymm4 & ymm18) ; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX512-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm5 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm19 & ymm20) | (~ymm5 & ymm19 & ymm20) | (ymm5 & ymm19 & ~ymm20) | (ymm5 & ymm19 & ymm20) ; AVX512-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512-NEXT: vpternlogq $248, %ymm18, %ymm4, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm4 & ymm18) | (ymm1 & ~ymm4 & ~ymm18) | (ymm1 & ~ymm4 & ymm18) | (ymm1 & ymm4 & ~ymm18) | (ymm1 & ymm4 & ymm18) ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 ; AVX512-NEXT: vpshufb %ymm11, %ymm5, %ymm5 ; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm1 @@ -7576,24 +7577,24 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: 
vpshufb %xmm0, %xmm9, %xmm0 ; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-NEXT: vpternlogq $226, %ymm23, %ymm2, %ymm6 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ~ymm2 & ymm23) | (ymm6 & ~ymm2 & ymm23) | (ymm6 & ymm2 & ~ymm23) | (ymm6 & ymm2 & ymm23) ; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3,4],xmm8[5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm9 -; AVX512-NEXT: vpternlogq $248, %ymm18, %ymm9, %ymm5 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ymm9 & ymm18) | (ymm5 & ~ymm9 & ~ymm18) | (ymm5 & ~ymm9 & ymm18) | (ymm5 & ymm9 & ~ymm18) | (ymm5 & ymm9 & ymm18) ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ~ymm19 & ymm20) | (~ymm2 & ymm19 & ymm20) | (ymm2 & ymm19 & ~ymm20) | (ymm2 & ymm19 & ymm20) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpternlogq $242, %ymm7, %ymm9, %ymm11 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ~ymm9 & ymm7) | (ymm11 & ~ymm9 & ~ymm7) | (ymm11 & ~ymm9 & ymm7) | (ymm11 & ymm9 & ~ymm7) | (ymm11 & ymm9 & ymm7) ; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm7 ; AVX512-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm11 ; AVX512-NEXT: vinserti32x4 $2, %xmm26, %zmm11, %zmm11 
-; AVX512-NEXT: vpternlogq $226, %zmm11, %zmm9, %zmm8 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm9 & zmm11) | (zmm8 & ~zmm9 & zmm11) | (zmm8 & zmm9 & ~zmm11) | (zmm8 & zmm9 & zmm11) ; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm11 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512-NEXT: vpternlogq $184, %zmm8, %zmm11, %zmm7 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & zmm11 & zmm8) | (zmm7 & ~zmm11 & ~zmm8) | (zmm7 & ~zmm11 & zmm8) | (zmm7 & zmm11 & zmm8) ; AVX512-NEXT: vmovdqa64 %xmm22, %xmm8 ; AVX512-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX512-NEXT: vpshufb %xmm13, %xmm12, %xmm8 @@ -7603,22 +7604,22 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpternlogq $242, %ymm0, %ymm9, %ymm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ~ymm9 & ymm0) | (ymm2 & ~ymm9 & ~ymm0) | (ymm2 & ~ymm9 & ymm0) | (ymm2 & ymm9 & ~ymm0) | (ymm2 & ymm9 & ymm0) ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm2 ; AVX512-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 -; AVX512-NEXT: vpternlogq $226, %zmm2, %zmm9, %zmm1 -; AVX512-NEXT: vpternlogq $184, %zmm1, %zmm11, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm9 & zmm2) | (zmm1 & ~zmm9 & zmm2) | (zmm1 & zmm9 & ~zmm2) | (zmm1 & zmm9 & zmm2) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm11 & zmm1) | (zmm0 & ~zmm11 & ~zmm1) | (zmm0 & ~zmm11 & zmm1) | (zmm0 & zmm11 & zmm1) ; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX512-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload ; 
AVX512-NEXT: vpmovsxwd {{.*#+}} zmm2 = [0,0,0,0,0,4294967040,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295] -; AVX512-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm16 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = (~zmm16 & zmm2 & zmm1) | (zmm16 & ~zmm2 & ~zmm1) | (zmm16 & ~zmm2 & zmm1) | (zmm16 & zmm2 & zmm1) ; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX512-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm21 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm21 = (~zmm21 & zmm2 & zmm1) | (zmm21 & ~zmm2 & ~zmm1) | (zmm21 & ~zmm2 & zmm1) | (zmm21 & zmm2 & zmm1) ; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm1 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] -; AVX512-NEXT: vpternlogq $184, %zmm16, %zmm1, %zmm4 -; AVX512-NEXT: vpternlogq $184, %zmm21, %zmm1, %zmm5 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm1 & zmm16) | (zmm4 & ~zmm1 & ~zmm16) | (zmm4 & ~zmm1 & zmm16) | (zmm4 & zmm1 & zmm16) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm1 & zmm21) | (zmm5 & ~zmm1 & ~zmm21) | (zmm5 & ~zmm1 & zmm21) | (zmm5 & zmm1 & zmm21) ; AVX512-NEXT: vmovdqa64 %zmm15, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm17, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm4, (%rcx) @@ -7638,7 +7639,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm25 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26 ; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm0 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm25, %ymm26, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm26 & ymm25) | (~ymm0 & ymm26 & ymm25) | (ymm0 & ymm26 & ~ymm25) | (ymm0 & ymm26 & ymm25) ; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm3 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = 
[128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 @@ -7649,7 +7650,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm24 ; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm18 ; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm6 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm24, %ymm18, %ymm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ~ymm18 & ymm24) | (~ymm6 & ymm18 & ymm24) | (ymm6 & ymm18 & ~ymm24) | (ymm6 & ymm18 & ymm24) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm10 @@ -7660,7 +7661,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm10, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm9 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm9 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ~ymm30 & ymm31) | (~ymm9 & ymm30 & ymm31) | (ymm9 & ymm30 & ~ymm31) | (ymm9 & ymm30 & ymm31) ; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm1 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm13 ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm13, %xmm5 @@ -7668,7 +7669,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm29 ; AVX512-FCP-NEXT: vmovdqa64 352(%rdi), %ymm22 ; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm1 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm29, %ymm22, %ymm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ~ymm22 & ymm29) | (~ymm1 & ymm22 & ymm29) | (ymm1 & ymm22 & ~ymm29) | (ymm1 & ymm22 & ymm29) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm8 @@ -7694,7 +7695,7 @@ define void 
@load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm4 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm26, %ymm25, %ymm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ~ymm25 & ymm26) | (~ymm4 & ymm25 & ymm26) | (ymm4 & ymm25 & ~ymm26) | (ymm4 & ymm25 & ymm26) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm15 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm1 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] @@ -7702,7 +7703,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpor %xmm1, %xmm5, %xmm1 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm5 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm18, %ymm24, %ymm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm24 & ymm18) | (~ymm5 & ymm24 & ymm18) | (ymm5 & ymm24 & ~ymm18) | (ymm5 & ymm24 & ymm18) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] ; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm7 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm1 @@ -7711,13 +7712,13 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpor %xmm7, %xmm13, %xmm2 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm13 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm13 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = (~ymm13 & ~ymm31 & ymm30) | (~ymm13 & ymm31 & ymm30) | (ymm13 & ymm31 & ~ymm30) | (ymm13 & ymm31 & ymm30) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm13, 
%xmm6 ; AVX512-FCP-NEXT: vporq %xmm0, %xmm6, %xmm16 ; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm11 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm22, %ymm29, %ymm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ~ymm29 & ymm22) | (~ymm11 & ymm29 & ymm22) | (ymm11 & ymm29 & ~ymm22) | (ymm11 & ymm29 & ymm22) ; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm11, %xmm8 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm7 ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm10 @@ -7738,7 +7739,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vinserti32x4 $1, 288(%rdi), %ymm1, %ymm20 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ~ymm20 & ymm19) | (~ymm1 & ymm20 & ymm19) | (ymm1 & ymm20 & ~ymm19) | (ymm1 & ymm20 & ymm19) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3,4,5,6,7],ymm15[8,9,10],ymm3[11,12,13,14,15] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5,6,7] @@ -7747,25 +7748,26 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vinserti32x4 $1, 96(%rdi), %ymm23, %ymm23 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm15 ; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm6, %ymm23, %ymm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ~ymm23 & ymm6) | (~ymm2 & ymm23 & ymm6) | (ymm2 & ymm23 & ~ymm6) | (ymm2 & ymm23 & ymm6) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq $248, %ymm4, %ymm17, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ymm17 & ymm4) | (ymm0 & ~ymm17 & ~ymm4) | (ymm0 & ~ymm17 & ymm4) | (ymm0 & ymm17 & ~ymm4) | (ymm0 & ymm17 & ymm4) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm0 = (~zmm0 & ~zmm17 & mem) | (zmm0 & ~zmm17 & mem) | (zmm0 & zmm17 & ~mem) | (zmm0 & zmm17 & mem) ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm15 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = (~zmm15 & zmm3 & zmm0) | (zmm15 & ~zmm3 & ~zmm0) | (zmm15 & ~zmm3 & zmm0) | (zmm15 & zmm3 & zmm0) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq $248, %ymm4, %ymm21, %ymm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm21 & ymm4) | (ymm1 & ~ymm21 & ~ymm4) | (ymm1 & ~ymm21 & ymm4) | (ymm1 & ymm21 & ~ymm4) | (ymm1 & ymm21 & ymm4) ; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm2 ; 
AVX512-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq $226, %zmm2, %zmm17, %zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm17 & zmm2) | (zmm1 & ~zmm17 & zmm2) | (zmm1 & zmm17 & ~zmm2) | (zmm1 & zmm17 & zmm2) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm17 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm17 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = (~zmm17 & zmm3 & zmm1) | (zmm17 & ~zmm3 & ~zmm1) | (zmm17 & ~zmm3 & zmm1) | (zmm17 & zmm3 & zmm1) ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm14, %xmm0 ; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm13, %xmm1 ; AVX512-FCP-NEXT: vporq %xmm0, %xmm1, %xmm21 @@ -7773,7 +7775,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u],zero,zero,zero,xmm7[3,9,15],zero,zero,xmm7[1,7,13] ; AVX512-FCP-NEXT: vporq %xmm0, %xmm1, %xmm28 ; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm11 -; AVX512-FCP-NEXT: vpternlogq $226, %ymm26, %ymm12, %ymm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ~ymm12 & ymm26) | (ymm11 & ~ymm12 & ymm26) | (ymm11 & ymm12 & ~ymm26) | (ymm11 & ymm12 & ymm26) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] @@ -7781,17 +7783,17 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm25 ; AVX512-FCP-NEXT: vporq %xmm1, %xmm2, %xmm26 ; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm14 -; AVX512-FCP-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm14 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = (~ymm14 & ~ymm9 & ymm24) | (ymm14 & ~ymm9 & ymm24) | (ymm14 & ymm9 & ~ymm24) | (ymm14 & ymm9 & ymm24) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm10 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} 
xmm1 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] ; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm2 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm4 ; AVX512-FCP-NEXT: vporq %xmm2, %xmm4, %xmm27 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm12 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = (~ymm12 & ~ymm31 & ymm30) | (~ymm12 & ymm31 & ymm30) | (ymm12 & ymm31 & ~ymm30) | (ymm12 & ymm31 & ymm30) ; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm23, %ymm6, %ymm4 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm29, %ymm22, %ymm9 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ~ymm6 & ymm23) | (~ymm4 & ymm6 & ymm23) | (ymm4 & ymm6 & ~ymm23) | (ymm4 & ymm6 & ymm23) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ~ymm22 & ymm29) | (~ymm9 & ymm22 & ymm29) | (ymm9 & ymm22 & ~ymm29) | (ymm9 & ymm22 & ymm29) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm8 ; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm1 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm2 @@ -7810,14 +7812,14 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,4,10,0,6,12,18,24,30,20,26,128,128,128,128,128,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm11 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512-FCP-NEXT: vpternlogq $236, %ymm18, %ymm11, %ymm16 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm16 = (~ymm16 & ymm11 & ~ymm18) | (~ymm16 & ymm11 & ymm18) | (ymm16 & ~ymm11 & ymm18) | (ymm16 & ymm11 & ~ymm18) | (ymm16 & ymm11 & ymm18) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm11, 
%ymm4, %ymm4 -; AVX512-FCP-NEXT: vpternlogq $236, %ymm18, %ymm4, %ymm21 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm21 = (~ymm21 & ymm4 & ~ymm18) | (~ymm21 & ymm4 & ymm18) | (ymm21 & ~ymm4 & ymm18) | (ymm21 & ymm4 & ~ymm18) | (ymm21 & ymm4 & ymm18) ; AVX512-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm19 & ymm20) | (~ymm5 & ymm19 & ymm20) | (ymm5 & ymm19 & ~ymm20) | (ymm5 & ymm19 & ymm20) ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512-FCP-NEXT: vpternlogq $248, %ymm18, %ymm4, %ymm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm4 & ymm18) | (ymm1 & ~ymm4 & ~ymm18) | (ymm1 & ~ymm4 & ymm18) | (ymm1 & ymm4 & ~ymm18) | (ymm1 & ymm4 & ymm18) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm1 @@ -7829,24 +7831,24 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX512-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %ymm23, %ymm2, %ymm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ~ymm2 & ymm23) | (ymm6 & ~ymm2 & ymm23) | (ymm6 & ymm2 & ~ymm23) | (ymm6 & ymm2 & ymm23) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3,4],xmm8[5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm9 -; AVX512-FCP-NEXT: vpternlogq $248, %ymm18, %ymm9, %ymm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ymm9 & ymm18) | (ymm5 & ~ymm9 & ~ymm18) | (ymm5 & ~ymm9 & 
ymm18) | (ymm5 & ymm9 & ~ymm18) | (ymm5 & ymm9 & ymm18) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ~ymm19 & ymm20) | (~ymm2 & ymm19 & ymm20) | (ymm2 & ymm19 & ~ymm20) | (ymm2 & ymm19 & ymm20) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq $242, %ymm7, %ymm9, %ymm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ~ymm9 & ymm7) | (ymm11 & ~ymm9 & ~ymm7) | (ymm11 & ~ymm9 & ymm7) | (ymm11 & ymm9 & ~ymm7) | (ymm11 & ymm9 & ymm7) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm7 ; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm11 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm26, %zmm11, %zmm11 -; AVX512-FCP-NEXT: vpternlogq $226, %zmm11, %zmm9, %zmm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm9 & zmm11) | (zmm8 & ~zmm9 & zmm11) | (zmm8 & zmm9 & ~zmm11) | (zmm8 & zmm9 & zmm11) ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm11 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm8, %zmm11, %zmm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & zmm11 & zmm8) | (zmm7 & ~zmm11 & ~zmm8) | (zmm7 & ~zmm11 & zmm8) | (zmm7 & zmm11 & zmm8) ; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm8 ; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm12, %xmm8 @@ -7856,22 +7858,22 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, ptr ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq $242, %ymm0, %ymm9, %ymm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ~ymm9 & ymm0) | (ymm2 & ~ymm9 & ~ymm0) | (ymm2 & ~ymm9 & ymm0) | (ymm2 & ymm9 & ~ymm0) | (ymm2 & ymm9 & ymm0) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm2 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpternlogq $226, %zmm2, %zmm9, %zmm1 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm1, %zmm11, %zmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm9 & zmm2) | (zmm1 & ~zmm9 & zmm2) | (zmm1 & zmm9 & ~zmm2) | (zmm1 & zmm9 & zmm2) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm11 & zmm1) | (zmm0 & ~zmm11 & ~zmm1) | (zmm0 & ~zmm11 & zmm1) | (zmm0 & zmm11 & zmm1) ; AVX512-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX512-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload ; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} zmm2 = [0,0,0,0,0,4294967040,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm16 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = (~zmm16 & zmm2 & zmm1) | (zmm16 & ~zmm2 & ~zmm1) | (zmm16 & ~zmm2 & zmm1) | (zmm16 & zmm2 & zmm1) ; AVX512-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX512-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm21 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = (~zmm21 & zmm2 & zmm1) | 
(zmm21 & ~zmm2 & ~zmm1) | (zmm21 & ~zmm2 & zmm1) | (zmm21 & zmm2 & zmm1) ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm1 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm16, %zmm1, %zmm4 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm21, %zmm1, %zmm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm1 & zmm16) | (zmm4 & ~zmm1 & ~zmm16) | (zmm4 & ~zmm1 & zmm16) | (zmm4 & zmm1 & zmm16) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm1 & zmm21) | (zmm5 & ~zmm1 & ~zmm21) | (zmm5 & ~zmm1 & zmm21) | (zmm5 & zmm1 & zmm21) ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rcx) @@ -7891,7 +7893,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa64 224(%rdi), %ymm25 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm26 ; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm0 -; AVX512DQ-NEXT: vpternlogq $202, %ymm25, %ymm26, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm26 & ymm25) | (~ymm0 & ymm26 & ymm25) | (ymm0 & ymm26 & ~ymm25) | (ymm0 & ymm26 & ymm25) ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm3 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm4 @@ -7902,7 +7904,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm24 ; AVX512DQ-NEXT: vmovdqa64 160(%rdi), %ymm18 ; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm6 -; AVX512DQ-NEXT: vpternlogq $202, %ymm24, %ymm18, %ymm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ~ymm18 & ymm24) | (~ymm6 & ymm18 & ymm24) | (ymm6 & ymm18 & ~ymm24) | (ymm6 & ymm18 & ymm24) ; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm7, 
%xmm10 @@ -7913,7 +7915,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm9, %zmm10, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm9 -; AVX512DQ-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ~ymm30 & ymm31) | (~ymm9 & ymm30 & ymm31) | (ymm9 & ymm30 & ~ymm31) | (ymm9 & ymm30 & ymm31) ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm9, %xmm1 ; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm13 ; AVX512DQ-NEXT: vpshufb %xmm5, %xmm13, %xmm5 @@ -7921,7 +7923,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %ymm29 ; AVX512DQ-NEXT: vmovdqa64 352(%rdi), %ymm22 ; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm1 -; AVX512DQ-NEXT: vpternlogq $202, %ymm29, %ymm22, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ~ymm22 & ymm29) | (~ymm1 & ymm22 & ymm29) | (ymm1 & ymm22 & ~ymm29) | (ymm1 & ymm22 & ymm29) ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm5 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX512DQ-NEXT: vpshufb %xmm8, %xmm1, %xmm8 @@ -7947,7 +7949,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm4 -; AVX512DQ-NEXT: vpternlogq $202, %ymm26, %ymm25, %ymm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ~ymm25 & ymm26) | (~ymm4 & ymm25 & ymm26) | (ymm4 & ymm25 & ~ymm26) | (ymm4 & ymm25 & ymm26) ; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm15 ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm15, %xmm1 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] @@ -7955,7 +7957,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, 
ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpor %xmm1, %xmm5, %xmm1 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm5 -; AVX512DQ-NEXT: vpternlogq $202, %ymm18, %ymm24, %ymm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm24 & ymm18) | (~ymm5 & ymm24 & ymm18) | (ymm5 & ymm24 & ~ymm18) | (ymm5 & ymm24 & ymm18) ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] ; AVX512DQ-NEXT: vpshufb %xmm8, %xmm5, %xmm7 ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm1 @@ -7964,13 +7966,13 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpor %xmm7, %xmm13, %xmm2 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm13 -; AVX512DQ-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm13 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = (~ymm13 & ~ymm31 & ymm30) | (~ymm13 & ymm31 & ymm30) | (ymm13 & ymm31 & ~ymm30) | (ymm13 & ymm31 & ymm30) ; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX512DQ-NEXT: vpshufb %xmm6, %xmm13, %xmm6 ; AVX512DQ-NEXT: vporq %xmm0, %xmm6, %xmm16 ; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm11 -; AVX512DQ-NEXT: vpternlogq $202, %ymm22, %ymm29, %ymm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ~ymm29 & ymm22) | (~ymm11 & ymm29 & ymm22) | (ymm11 & ymm29 & ~ymm22) | (ymm11 & ymm29 & ymm22) ; AVX512DQ-NEXT: vpshufb %xmm8, %xmm11, %xmm8 ; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm7 ; AVX512DQ-NEXT: vpshufb %xmm10, %xmm7, %xmm10 @@ -7991,7 +7993,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vinserti32x4 $1, 288(%rdi), %ymm1, %ymm20 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512DQ-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm1 +; 
AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ~ymm20 & ymm19) | (~ymm1 & ymm20 & ymm19) | (ymm1 & ymm20 & ~ymm19) | (ymm1 & ymm20 & ymm19) ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3,4,5,6,7],ymm15[8,9,10],ymm3[11,12,13,14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5,6,7] @@ -8000,25 +8002,26 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vinserti32x4 $1, 96(%rdi), %ymm23, %ymm23 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm15 ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512DQ-NEXT: vpternlogq $202, %ymm6, %ymm23, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ~ymm23 & ymm6) | (~ymm2 & ymm23 & ymm6) | (ymm2 & ymm23 & ~ymm6) | (ymm2 & ymm23 & ymm6) ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpternlogq $248, %ymm4, %ymm17, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ymm17 & ymm4) | (ymm0 & ~ymm17 & ~ymm4) | (ymm0 & ~ymm17 & ymm4) | (ymm0 & ymm17 & ~ymm4) | (ymm0 & ymm17 & ymm4) ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm0 = (~zmm0 & ~zmm17 & mem) | (zmm0 & ~zmm17 & mem) | (zmm0 & zmm17 & ~mem) | (zmm0 & zmm17 & mem) ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm3 = 
[18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] -; AVX512DQ-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm15 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm15 = (~zmm15 & zmm3 & zmm0) | (zmm15 & ~zmm3 & ~zmm0) | (zmm15 & ~zmm3 & zmm0) | (zmm15 & zmm3 & zmm0) ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm1 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpternlogq $248, %ymm4, %ymm21, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm21 & ymm4) | (ymm1 & ~ymm21 & ~ymm4) | (ymm1 & ~ymm21 & ymm4) | (ymm1 & ymm21 & ~ymm4) | (ymm1 & ymm21 & ymm4) ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm2 ; AVX512DQ-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq $226, %zmm2, %zmm17, %zmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm17 & zmm2) | (zmm1 & ~zmm17 & zmm2) | (zmm1 & zmm17 & ~zmm2) | (zmm1 & zmm17 & zmm2) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm17 -; AVX512DQ-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm17 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = (~zmm17 & zmm3 & zmm1) | (zmm17 & ~zmm3 & ~zmm1) | (zmm17 & ~zmm3 & zmm1) | (zmm17 & zmm3 & zmm1) ; AVX512DQ-NEXT: vpshufb %xmm10, %xmm14, %xmm0 ; AVX512DQ-NEXT: vpshufb %xmm8, %xmm13, %xmm1 ; AVX512DQ-NEXT: vporq %xmm0, %xmm1, %xmm21 @@ -8026,7 +8029,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm7[u,u,u,u,u],zero,zero,zero,xmm7[3,9,15],zero,zero,xmm7[1,7,13] ; AVX512DQ-NEXT: vporq %xmm0, %xmm1, %xmm28 ; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm11 -; AVX512DQ-NEXT: vpternlogq $226, %ymm26, %ymm12, %ymm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ~ymm12 & ymm26) | (ymm11 & ~ymm12 & ymm26) | (ymm11 & ymm12 & ~ymm26) | (ymm11 & ymm12 & ymm26) ; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] @@ -8034,17 +8037,17 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm25 ; AVX512DQ-NEXT: vporq %xmm1, %xmm2, %xmm26 ; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm14 -; AVX512DQ-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm14 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = (~ymm14 & ~ymm9 & ymm24) | (ymm14 & ~ymm9 & ymm24) | (ymm14 & ymm9 & ~ymm24) | (ymm14 & ymm9 & ymm24) ; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm10 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm10, %xmm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm14, %xmm4 ; AVX512DQ-NEXT: vporq %xmm2, %xmm4, %xmm27 -; AVX512DQ-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm12 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = (~ymm12 & ~ymm31 & ymm30) | (~ymm12 & ymm31 & ymm30) | (ymm12 & ymm31 & ~ymm30) | (ymm12 & ymm31 & ymm30) ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512DQ-NEXT: vpternlogq $202, %ymm23, %ymm6, %ymm4 -; AVX512DQ-NEXT: vpternlogq $202, %ymm29, %ymm22, %ymm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ~ymm6 & ymm23) | (~ymm4 & ymm6 & ymm23) | (ymm4 & ymm6 & ~ymm23) | (ymm4 & ymm6 & ymm23) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ~ymm22 & ymm29) | (~ymm9 & ymm22 & ymm29) | 
(ymm9 & ymm22 & ~ymm29) | (ymm9 & ymm22 & ymm29) ; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm8 ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm8, %xmm1 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm9, %xmm2 @@ -8063,14 +8066,14 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,4,10,0,6,12,18,24,30,20,26,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm4, %ymm11 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpternlogq $236, %ymm18, %ymm11, %ymm16 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm16 = (~ymm16 & ymm11 & ~ymm18) | (~ymm16 & ymm11 & ymm18) | (ymm16 & ~ymm11 & ymm18) | (ymm16 & ymm11 & ~ymm18) | (ymm16 & ymm11 & ymm18) ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512DQ-NEXT: vpternlogq $236, %ymm18, %ymm4, %ymm21 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm21 = (~ymm21 & ymm4 & ~ymm18) | (~ymm21 & ymm4 & ymm18) | (ymm21 & ~ymm4 & ymm18) | (ymm21 & ymm4 & ~ymm18) | (ymm21 & ymm4 & ymm18) ; AVX512DQ-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm19 & ymm20) | (~ymm5 & ymm19 & ymm20) | (ymm5 & ymm19 & ~ymm20) | (ymm5 & ymm19 & ymm20) ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512DQ-NEXT: vpternlogq $248, %ymm18, %ymm4, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm4 & ymm18) | (ymm1 & ~ymm4 & ~ymm18) | (ymm1 & ~ymm4 & ymm18) | (ymm1 & ymm4 & ~ymm18) | (ymm1 & ymm4 & ymm18) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm5, %ymm5 ; 
AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm1 @@ -8082,24 +8085,24 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq $226, %ymm23, %ymm2, %ymm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ~ymm2 & ymm23) | (ymm6 & ~ymm2 & ymm23) | (ymm6 & ymm2 & ~ymm23) | (ymm6 & ymm2 & ymm23) ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3,4],xmm8[5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm9 -; AVX512DQ-NEXT: vpternlogq $248, %ymm18, %ymm9, %ymm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ymm9 & ymm18) | (ymm5 & ~ymm9 & ~ymm18) | (ymm5 & ~ymm9 & ymm18) | (ymm5 & ymm9 & ~ymm18) | (ymm5 & ymm9 & ymm18) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512DQ-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ~ymm19 & ymm20) | (~ymm2 & ymm19 & ymm20) | (ymm2 & ymm19 & ~ymm20) | (ymm2 & ymm19 & ymm20) ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpternlogq $242, %ymm7, %ymm9, %ymm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ~ymm9 & ymm7) | (ymm11 & ~ymm9 & ~ymm7) | (ymm11 & ~ymm9 & ymm7) | 
(ymm11 & ymm9 & ~ymm7) | (ymm11 & ymm9 & ymm7) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm7 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm11 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm26, %zmm11, %zmm11 -; AVX512DQ-NEXT: vpternlogq $226, %zmm11, %zmm9, %zmm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm9 & zmm11) | (zmm8 & ~zmm9 & zmm11) | (zmm8 & zmm9 & ~zmm11) | (zmm8 & zmm9 & zmm11) ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm11 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512DQ-NEXT: vpternlogq $184, %zmm8, %zmm11, %zmm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & zmm11 & zmm8) | (zmm7 & ~zmm11 & ~zmm8) | (zmm7 & ~zmm11 & zmm8) | (zmm7 & zmm11 & zmm8) ; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm8 ; AVX512DQ-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpshufb %xmm13, %xmm12, %xmm8 @@ -8109,22 +8112,22 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpternlogq $242, %ymm0, %ymm9, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ~ymm9 & ymm0) | (ymm2 & ~ymm9 & ~ymm0) | (ymm2 & ~ymm9 & ymm0) | (ymm2 & ymm9 & ~ymm0) | (ymm2 & ymm9 & ymm0) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm2 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpternlogq $226, %zmm2, %zmm9, %zmm1 -; AVX512DQ-NEXT: vpternlogq $184, %zmm1, %zmm11, %zmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm9 & zmm2) | (zmm1 & ~zmm9 & zmm2) | (zmm1 & zmm9 & ~zmm2) | (zmm1 & zmm9 & zmm2) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm11 & zmm1) | (zmm0 & 
~zmm11 & ~zmm1) | (zmm0 & ~zmm11 & zmm1) | (zmm0 & zmm11 & zmm1) ; AVX512DQ-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX512DQ-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload ; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} zmm2 = [0,0,0,0,0,4294967040,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295] -; AVX512DQ-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm16 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = (~zmm16 & zmm2 & zmm1) | (zmm16 & ~zmm2 & ~zmm1) | (zmm16 & ~zmm2 & zmm1) | (zmm16 & zmm2 & zmm1) ; AVX512DQ-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX512DQ-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm21 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = (~zmm21 & zmm2 & zmm1) | (zmm21 & ~zmm2 & ~zmm1) | (zmm21 & ~zmm2 & zmm1) | (zmm21 & zmm2 & zmm1) ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm1 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] -; AVX512DQ-NEXT: vpternlogq $184, %zmm16, %zmm1, %zmm4 -; AVX512DQ-NEXT: vpternlogq $184, %zmm21, %zmm1, %zmm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm1 & zmm16) | (zmm4 & ~zmm1 & ~zmm16) | (zmm4 & ~zmm1 & zmm16) | (zmm4 & zmm1 & zmm16) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm1 & zmm21) | (zmm5 & ~zmm1 & ~zmm21) | (zmm5 & ~zmm1 & zmm21) | (zmm5 & zmm1 & zmm21) ; AVX512DQ-NEXT: vmovdqa64 %zmm15, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rcx) @@ -8144,7 +8147,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq $202, 
%ymm25, %ymm26, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm26 & ymm25) | (~ymm0 & ymm26 & ymm25) | (ymm0 & ymm26 & ~ymm25) | (ymm0 & ymm26 & ymm25) ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 @@ -8155,7 +8158,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm24 ; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm18 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm6 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm24, %ymm18, %ymm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ~ymm18 & ymm24) | (~ymm6 & ymm18 & ymm24) | (ymm6 & ymm18 & ~ymm24) | (ymm6 & ymm18 & ymm24) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm10 @@ -8166,7 +8169,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm10, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm9 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm9 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ~ymm30 & ymm31) | (~ymm9 & ymm30 & ymm31) | (ymm9 & ymm30 & ~ymm31) | (ymm9 & ymm30 & ymm31) ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm1 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm13 ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm13, %xmm5 @@ -8174,7 +8177,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm29 ; AVX512DQ-FCP-NEXT: vmovdqa64 352(%rdi), %ymm22 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm29, %ymm22, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq 
{{.*#+}} ymm1 = (~ymm1 & ~ymm22 & ymm29) | (~ymm1 & ymm22 & ymm29) | (ymm1 & ymm22 & ~ymm29) | (ymm1 & ymm22 & ymm29) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm8 @@ -8200,7 +8203,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm4 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm26, %ymm25, %ymm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ~ymm25 & ymm26) | (~ymm4 & ymm25 & ymm26) | (ymm4 & ymm25 & ~ymm26) | (ymm4 & ymm25 & ymm26) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm15 ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] @@ -8208,7 +8211,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm5, %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm5 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm18, %ymm24, %ymm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm24 & ymm18) | (~ymm5 & ymm24 & ymm18) | (ymm5 & ymm24 & ~ymm18) | (ymm5 & ymm24 & ymm18) ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm7 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm1 @@ -8217,13 +8220,13 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm13, %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm13 -; 
AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm13 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = (~ymm13 & ~ymm31 & ymm30) | (~ymm13 & ymm31 & ymm30) | (ymm13 & ymm31 & ~ymm30) | (ymm13 & ymm31 & ymm30) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm6 ; AVX512DQ-FCP-NEXT: vporq %xmm0, %xmm6, %xmm16 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm11 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm22, %ymm29, %ymm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ~ymm29 & ymm22) | (~ymm11 & ymm29 & ymm22) | (ymm11 & ymm29 & ~ymm22) | (ymm11 & ymm29 & ymm22) ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm11, %xmm8 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm7 ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm10 @@ -8244,7 +8247,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 288(%rdi), %ymm1, %ymm20 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ~ymm20 & ymm19) | (~ymm1 & ymm20 & ymm19) | (ymm1 & ymm20 & ~ymm19) | (ymm1 & ymm20 & ymm19) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3,4,5,6,7],ymm15[8,9,10],ymm3[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5,6,7] @@ -8253,25 +8256,26 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 96(%rdi), %ymm23, %ymm23 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm6, 
%ymm23, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ~ymm23 & ymm6) | (~ymm2 & ymm23 & ymm6) | (ymm2 & ymm23 & ~ymm6) | (ymm2 & ymm23 & ymm6) ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm4, %ymm17, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ymm17 & ymm4) | (ymm0 & ~ymm17 & ~ymm4) | (ymm0 & ~ymm17 & ymm4) | (ymm0 & ymm17 & ~ymm4) | (ymm0 & ymm17 & ymm4) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm0 = (~zmm0 & ~zmm17 & mem) | (zmm0 & ~zmm17 & mem) | (zmm0 & zmm17 & ~mem) | (zmm0 & zmm17 & mem) ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm15 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = (~zmm15 & zmm3 & zmm0) | (zmm15 & ~zmm3 & ~zmm0) | (zmm15 & ~zmm3 & zmm0) | (zmm15 & zmm3 & zmm0) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm4, %ymm21, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm21 & ymm4) | (ymm1 & ~ymm21 & ~ymm4) | (ymm1 & ~ymm21 & ymm4) | (ymm1 & ymm21 & ~ymm4) | (ymm1 & ymm21 & ymm4) ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm2 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm2, %zmm17, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm17 & zmm2) | (zmm1 & ~zmm17 & zmm2) | (zmm1 & zmm17 & ~zmm2) | (zmm1 & zmm17 & zmm2) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm17 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm17 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = (~zmm17 & zmm3 & zmm1) | (zmm17 & ~zmm3 & ~zmm1) | (zmm17 & ~zmm3 & zmm1) | (zmm17 & zmm3 & zmm1) ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm14, %xmm0 ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm13, %xmm1 ; AVX512DQ-FCP-NEXT: vporq %xmm0, %xmm1, %xmm21 @@ -8279,7 +8283,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u],zero,zero,zero,xmm7[3,9,15],zero,zero,xmm7[1,7,13] ; AVX512DQ-FCP-NEXT: vporq %xmm0, %xmm1, %xmm28 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm11 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm26, %ymm12, %ymm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ~ymm12 & ymm26) | (ymm11 & ~ymm12 & ymm26) | (ymm11 & ymm12 & ~ymm26) | (ymm11 & ymm12 & ymm26) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] @@ -8287,17 +8291,17 @@ define void @load_i8_stride6_vf64(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm25 ; AVX512DQ-FCP-NEXT: vporq %xmm1, %xmm2, %xmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm14 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm14 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = (~ymm14 & ~ymm9 & ymm24) | (ymm14 & ~ymm9 & ymm24) | (ymm14 & ymm9 & ~ymm24) | (ymm14 & ymm9 & ymm24) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm10 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm4 ; AVX512DQ-FCP-NEXT: vporq %xmm2, %xmm4, %xmm27 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm12 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = (~ymm12 & ~ymm31 & ymm30) | (~ymm12 & ymm31 & ymm30) | (ymm12 & ymm31 & ~ymm30) | (ymm12 & ymm31 & ymm30) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm23, %ymm6, %ymm4 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm29, %ymm22, %ymm9 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ~ymm6 & ymm23) | (~ymm4 & ymm6 & ymm23) | (ymm4 & ymm6 & ~ymm23) | (ymm4 & ymm6 & ymm23) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ~ymm22 & ymm29) | (~ymm9 & ymm22 & ymm29) | (ymm9 & ymm22 & ~ymm29) | (ymm9 & ymm22 & ymm29) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm8 ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm2 @@ -8316,14 +8320,14 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,4,10,0,6,12,18,24,30,20,26,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = 
[255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm18, %ymm11, %ymm16 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm16 = (~ymm16 & ymm11 & ~ymm18) | (~ymm16 & ymm11 & ymm18) | (ymm16 & ~ymm11 & ymm18) | (ymm16 & ymm11 & ~ymm18) | (ymm16 & ymm11 & ymm18) ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm18, %ymm4, %ymm21 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm21 = (~ymm21 & ymm4 & ~ymm18) | (~ymm21 & ymm4 & ymm18) | (ymm21 & ~ymm4 & ymm18) | (ymm21 & ymm4 & ~ymm18) | (ymm21 & ymm4 & ymm18) ; AVX512DQ-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm19 & ymm20) | (~ymm5 & ymm19 & ymm20) | (ymm5 & ymm19 & ~ymm20) | (ymm5 & ymm19 & ymm20) ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm18, %ymm4, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm4 & ymm18) | (ymm1 & ~ymm4 & ~ymm18) | (ymm1 & ~ymm4 & ymm18) | (ymm1 & ymm4 & ~ymm18) | (ymm1 & ymm4 & ymm18) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm1 @@ -8335,24 +8339,24 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm23, %ymm2, %ymm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 
& ~ymm2 & ymm23) | (ymm6 & ~ymm2 & ymm23) | (ymm6 & ymm2 & ~ymm23) | (ymm6 & ymm2 & ymm23) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3,4],xmm8[5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm9 -; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm18, %ymm9, %ymm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ymm9 & ymm18) | (ymm5 & ~ymm9 & ~ymm18) | (ymm5 & ~ymm9 & ymm18) | (ymm5 & ymm9 & ~ymm18) | (ymm5 & ymm9 & ymm18) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ~ymm19 & ymm20) | (~ymm2 & ymm19 & ymm20) | (ymm2 & ymm19 & ~ymm20) | (ymm2 & ymm19 & ymm20) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq $242, %ymm7, %ymm9, %ymm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ~ymm9 & ymm7) | (ymm11 & ~ymm9 & ~ymm7) | (ymm11 & ~ymm9 & ymm7) | (ymm11 & ymm9 & ~ymm7) | (ymm11 & ymm9 & ymm7) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm11 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm26, %zmm11, %zmm11 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm11, %zmm9, %zmm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm9 & zmm11) | (zmm8 & ~zmm9 & zmm11) | (zmm8 & zmm9 & ~zmm11) | 
(zmm8 & zmm9 & zmm11) ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm11 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm8, %zmm11, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & zmm11 & zmm8) | (zmm7 & ~zmm11 & ~zmm8) | (zmm7 & ~zmm11 & zmm8) | (zmm7 & zmm11 & zmm8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm8 ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm12, %xmm8 @@ -8362,22 +8366,22 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq $242, %ymm0, %ymm9, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ~ymm9 & ymm0) | (ymm2 & ~ymm9 & ~ymm0) | (ymm2 & ~ymm9 & ymm0) | (ymm2 & ymm9 & ~ymm0) | (ymm2 & ymm9 & ymm0) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm2 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm2, %zmm9, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm9 & zmm2) | (zmm1 & ~zmm9 & zmm2) | (zmm1 & zmm9 & ~zmm2) | (zmm1 & zmm9 & zmm2) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm11 & zmm1) | (zmm0 & ~zmm11 & ~zmm1) | (zmm0 & ~zmm11 & zmm1) | (zmm0 & zmm11 & zmm1) ; AVX512DQ-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} zmm2 = 
[0,0,0,0,0,4294967040,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm16 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = (~zmm16 & zmm2 & zmm1) | (zmm16 & ~zmm2 & ~zmm1) | (zmm16 & ~zmm2 & zmm1) | (zmm16 & zmm2 & zmm1) ; AVX512DQ-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm21 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = (~zmm21 & zmm2 & zmm1) | (zmm21 & ~zmm2 & ~zmm1) | (zmm21 & ~zmm2 & zmm1) | (zmm21 & zmm2 & zmm1) ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm1 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm16, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm21, %zmm1, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm1 & zmm16) | (zmm4 & ~zmm1 & ~zmm16) | (zmm4 & ~zmm1 & zmm16) | (zmm4 & zmm1 & zmm16) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm1 & zmm21) | (zmm5 & ~zmm1 & ~zmm21) | (zmm5 & ~zmm1 & zmm21) | (zmm5 & zmm1 & zmm21) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rcx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll index 7d2f52d3c5830..cd481a30cb211 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll @@ -444,7 +444,7 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512-NEXT: 
vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15] -; AVX512-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm3 & ~mem) | (ymm2 & ~ymm3 & mem) | (ymm2 & ymm3 & ~mem) | (ymm2 & ymm3 & mem) ; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7] @@ -472,7 +472,7 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15] -; AVX512-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm3 & ~mem) | (ymm2 & ~ymm3 & mem) | (ymm2 & ymm3 & ~mem) | (ymm2 & ymm3 & mem) ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,8,9,10,11,12,13,14,15] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] @@ -499,7 +499,7 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15] -; AVX512DQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm3 & ~mem) | (ymm2 & ~ymm3 & mem) | (ymm2 & ymm3 & ~mem) | (ymm2 & ymm3 & mem) ; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm0 = 
xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7] @@ -527,7 +527,7 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15] -; AVX512DQ-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm3 & ~mem) | (ymm2 & ~ymm3 & mem) | (ymm2 & ymm3 & ~mem) | (ymm2 & ymm3 & mem) ; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,8,9,10,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] @@ -936,7 +936,7 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 -; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm5 & mem) | (zmm4 & ~zmm5 & ~mem) | (zmm4 & zmm5 & ~mem) | (zmm4 & zmm5 & mem) ; AVX512-NEXT: vpsrlq $48, %xmm3, %xmm3 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -975,7 +975,7 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 -; AVX512-FCP-NEXT: 
vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm5 & mem) | (zmm4 & ~zmm5 & ~mem) | (zmm4 & zmm5 & ~mem) | (zmm4 & zmm5 & mem) ; AVX512-FCP-NEXT: vpsrlq $48, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] @@ -1020,7 +1020,7 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 -; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm5 & mem) | (zmm4 & ~zmm5 & ~mem) | (zmm4 & zmm5 & ~mem) | (zmm4 & zmm5 & mem) ; AVX512DQ-NEXT: vpsrlq $48, %xmm3, %xmm3 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -1059,7 +1059,7 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm5 & mem) | (zmm4 & ~zmm5 & ~mem) | (zmm4 & zmm5 & ~mem) | (zmm4 & zmm5 & mem) ; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] @@ -1737,7 +1737,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] ; 
AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5],xmm8[6],xmm10[7] ; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0] -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm8 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = (~ymm8 & ymm5 & mem) | (ymm8 & ~ymm5 & ~mem) | (ymm8 & ymm5 & ~mem) | (ymm8 & ymm5 & mem) ; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm5 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] @@ -1748,12 +1748,12 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,6] ; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512-NEXT: vpternlogq $226, %ymm7, %ymm8, %ymm6 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ~ymm8 & ymm7) | (ymm6 & ~ymm8 & ymm7) | (ymm6 & ymm8 & ~ymm7) | (ymm6 & ymm8 & ymm7) ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,1,1,1] ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm5 & mem) | (zmm6 & ~zmm5 & ~mem) | (zmm6 & zmm5 & ~mem) | (zmm6 & zmm5 & mem) ; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,1,2,2] ; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] @@ -1761,7 +1761,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufd 
{{.*#+}} ymm9 = ymm2[3,0,3,0,7,4,7,4] ; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] -; AVX512-NEXT: vpternlogq $226, %ymm5, %ymm9, %ymm7 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = (~ymm7 & ~ymm9 & ymm5) | (ymm7 & ~ymm9 & ymm5) | (ymm7 & ymm9 & ~ymm5) | (ymm7 & ymm9 & ymm5) ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,1,4,5,6,5] ; AVX512-NEXT: vprolq $16, %ymm1, %ymm10 ; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] @@ -1770,14 +1770,14 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] ; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] ; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512-NEXT: vpternlogq $226, %ymm5, %ymm8, %ymm10 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ~ymm8 & ymm5) | (ymm10 & ~ymm8 & ymm5) | (ymm10 & ymm8 & ~ymm5) | (ymm10 & ymm8 & ymm5) ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm5 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512-NEXT: vpandnq 16(%r8){1to4}, %ymm7, %ymm7 ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512-NEXT: vpternlogq $248, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm5 & mem) | (zmm4 & ~zmm5 & ~mem) | (zmm4 & ~zmm5 & mem) | (zmm4 & zmm5 & ~mem) | (zmm4 & zmm5 & mem) ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,2,3,3,7,6,7,7] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] @@ -1788,9 +1788,9 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,2,6,7,6,6] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX512-NEXT: vpternlogq $226, %ymm2, %ymm9, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm9 & ymm2) | (ymm0 & ~ymm9 & ymm2) | (ymm0 & ymm9 & ~ymm2) | (ymm0 & ymm9 & ymm2) ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512-NEXT: vpternlogq $202, 24(%r8){1to4}, %ymm0, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ~ymm0 & mem) | (~ymm1 & ymm0 & mem) | (ymm1 & ymm0 & ~mem) | (ymm1 & ymm0 & mem) ; AVX512-NEXT: vmovdqa %ymm1, 128(%r9) ; AVX512-NEXT: vmovdqa64 %zmm4, 64(%r9) ; AVX512-NEXT: vmovdqa64 %zmm6, (%r9) @@ -1815,7 +1815,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5],xmm8[6],xmm10[7] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0] -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = (~ymm8 & ymm5 & mem) | (ymm8 & ~ymm5 & ~mem) | (ymm8 & ymm5 & ~mem) | (ymm8 & ymm5 & mem) ; AVX512-FCP-NEXT: 
vinserti64x4 $1, %ymm8, %zmm0, %zmm5 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] @@ -1825,12 +1825,12 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %ymm7, %ymm8, %ymm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ~ymm8 & ymm7) | (ymm6 & ~ymm8 & ymm7) | (ymm6 & ymm8 & ~ymm7) | (ymm6 & ymm8 & ymm7) ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512-FCP-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,1,1,1] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm5 & mem) | (zmm6 & ~zmm5 & ~mem) | (zmm6 & zmm5 & ~mem) | (zmm6 & zmm5 & mem) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,1,2,2] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] @@ -1838,7 +1838,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[3,0,3,0,7,4,7,4] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = 
[65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %ymm5, %ymm9, %ymm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = (~ymm7 & ~ymm9 & ymm5) | (ymm7 & ~ymm9 & ymm5) | (ymm7 & ymm9 & ~ymm5) | (ymm7 & ymm9 & ymm5) ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,1,4,5,6,5] ; AVX512-FCP-NEXT: vprolq $16, %ymm1, %ymm10 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] @@ -1847,14 +1847,14 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512-FCP-NEXT: vpternlogq $226, %ymm5, %ymm8, %ymm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ~ymm8 & ymm5) | (ymm10 & ~ymm8 & ymm5) | (ymm10 & ymm8 & ~ymm5) | (ymm10 & ymm8 & ymm5) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm5 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vpandnq 16(%r8){1to4}, %ymm7, %ymm7 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm5 & mem) | (zmm4 & ~zmm5 & ~mem) | (zmm4 & ~zmm5 & mem) | (zmm4 & zmm5 
& ~mem) | (zmm4 & zmm5 & mem) ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,2,3,3,7,6,7,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4],ymm3[5,6,7,8],ymm2[9],ymm3[10],ymm2[11,12],ymm3[13,14,15] @@ -1863,9 +1863,9 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX512-FCP-NEXT: vpternlogq $226, %ymm2, %ymm9, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm9 & ymm2) | (ymm0 & ~ymm9 & ymm2) | (ymm0 & ymm9 & ~ymm2) | (ymm0 & ymm9 & ymm2) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512-FCP-NEXT: vpternlogq $202, 24(%r8){1to4}, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ~ymm0 & mem) | (~ymm1 & ymm0 & mem) | (ymm1 & ymm0 & ~mem) | (ymm1 & ymm0 & mem) ; AVX512-FCP-NEXT: vmovdqa %ymm1, 128(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%r9) @@ -1890,7 +1890,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5],xmm8[6],xmm10[7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0] -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = (~ymm8 & ymm5 & mem) | (ymm8 & ~ymm5 & ~mem) | (ymm8 & ymm5 & ~mem) | (ymm8 & 
ymm5 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm5 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] @@ -1901,12 +1901,12 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,6] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq $226, %ymm7, %ymm8, %ymm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ~ymm8 & ymm7) | (ymm6 & ~ymm8 & ymm7) | (ymm6 & ymm8 & ~ymm7) | (ymm6 & ymm8 & ymm7) ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQ-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,1,1,1] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm5 & mem) | (zmm6 & ~zmm5 & ~mem) | (zmm6 & zmm5 & ~mem) | (zmm6 & zmm5 & mem) ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,1,2,2] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] @@ -1914,7 +1914,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[3,0,3,0,7,4,7,4] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] -; 
AVX512DQ-NEXT: vpternlogq $226, %ymm5, %ymm9, %ymm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = (~ymm7 & ~ymm9 & ymm5) | (ymm7 & ~ymm9 & ymm5) | (ymm7 & ymm9 & ~ymm5) | (ymm7 & ymm9 & ymm5) ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,1,4,5,6,5] ; AVX512DQ-NEXT: vprolq $16, %ymm1, %ymm10 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] @@ -1923,14 +1923,14 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512DQ-NEXT: vpternlogq $226, %ymm5, %ymm8, %ymm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ~ymm8 & ymm5) | (ymm10 & ~ymm8 & ymm5) | (ymm10 & ymm8 & ~ymm5) | (ymm10 & ymm8 & ymm5) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm5 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512DQ-NEXT: vpandnq 16(%r8){1to4}, %ymm7, %ymm7 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512DQ-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm5 & mem) | (zmm4 & ~zmm5 & ~mem) | (zmm4 & ~zmm5 & mem) | (zmm4 & zmm5 & ~mem) | (zmm4 & zmm5 & mem) ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,2,3,3,7,6,7,7] ; AVX512DQ-NEXT: 
vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] @@ -1941,9 +1941,9 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,2,6,7,6,6] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX512DQ-NEXT: vpternlogq $226, %ymm2, %ymm9, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm9 & ymm2) | (ymm0 & ~ymm9 & ymm2) | (ymm0 & ymm9 & ~ymm2) | (ymm0 & ymm9 & ymm2) ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512DQ-NEXT: vpternlogq $202, 24(%r8){1to4}, %ymm0, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ~ymm0 & mem) | (~ymm1 & ymm0 & mem) | (ymm1 & ymm0 & ~mem) | (ymm1 & ymm0 & mem) ; AVX512DQ-NEXT: vmovdqa %ymm1, 128(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm6, (%r9) @@ -1968,7 +1968,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5],xmm8[6],xmm10[7] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = (~ymm8 & ymm5 & mem) | (ymm8 & ~ymm5 & ~mem) | (ymm8 & ymm5 & ~mem) | (ymm8 & ymm5 & mem) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] @@ -1978,12 
+1978,12 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm7, %ymm8, %ymm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ~ymm8 & ymm7) | (ymm6 & ~ymm8 & ymm7) | (ymm6 & ymm8 & ~ymm7) | (ymm6 & ymm8 & ymm7) ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,1,1,1] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm5 & mem) | (zmm6 & ~zmm5 & ~mem) | (zmm6 & zmm5 & ~mem) | (zmm6 & zmm5 & mem) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,1,2,2] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] @@ -1991,7 +1991,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[3,0,3,0,7,4,7,4] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm5, %ymm9, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = (~ymm7 & ~ymm9 & ymm5) | (ymm7 & ~ymm9 & ymm5) | (ymm7 & ymm9 & ~ymm5) | (ymm7 
& ymm9 & ymm5) ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,1,4,5,6,5] ; AVX512DQ-FCP-NEXT: vprolq $16, %ymm1, %ymm10 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] @@ -2000,14 +2000,14 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm5, %ymm8, %ymm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ~ymm8 & ymm5) | (ymm10 & ~ymm8 & ymm5) | (ymm10 & ymm8 & ~ymm5) | (ymm10 & ymm8 & ymm5) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vpandnq 16(%r8){1to4}, %ymm7, %ymm7 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm5 & mem) | (zmm4 & ~zmm5 & ~mem) | (zmm4 & ~zmm5 & mem) | (zmm4 & zmm5 & ~mem) | (zmm4 & zmm5 & mem) ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,2,3,3,7,6,7,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] ; 
AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4],ymm3[5,6,7,8],ymm2[9],ymm3[10],ymm2[11,12],ymm3[13,14,15] @@ -2016,9 +2016,9 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm2, %ymm9, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm9 & ymm2) | (ymm0 & ~ymm9 & ymm2) | (ymm0 & ymm9 & ~ymm2) | (ymm0 & ymm9 & ymm2) ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512DQ-FCP-NEXT: vpternlogq $202, 24(%r8){1to4}, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ~ymm0 & mem) | (~ymm1 & ymm0 & mem) | (ymm1 & ymm0 & ~mem) | (ymm1 & ymm0 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, 128(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 64(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%r9) @@ -3383,11 +3383,11 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,2] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm2[0,1,0,1] ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & ~zmm16 & zmm0) | (zmm3 & ~zmm16 & zmm0) | (zmm3 & zmm16 & ~zmm0) | (zmm3 & zmm16 & zmm0) ; AVX512-NEXT: vpbroadcastq 24(%r8), %ymm0 ; AVX512-NEXT: vpbroadcastq 32(%r8), %ymm2 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, 
%zmm19 -; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm19 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm19 = (~zmm19 & zmm3 & mem) | (zmm19 & ~zmm3 & ~mem) | (zmm19 & zmm3 & ~mem) | (zmm19 & zmm3 & mem) ; AVX512-NEXT: vmovdqa 32(%rsi), %ymm3 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] ; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm0 @@ -3413,7 +3413,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7] ; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0] ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm9, %zmm2 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm11 & mem) | (zmm2 & ~zmm11 & ~mem) | (zmm2 & zmm11 & ~mem) | (zmm2 & zmm11 & mem) ; AVX512-NEXT: vmovdqa (%r8), %ymm9 ; AVX512-NEXT: vmovdqa 32(%r8), %ymm10 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] @@ -3422,7 +3422,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,1] ; AVX512-NEXT: vpandnq %ymm10, %ymm21, %ymm10 ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm10, %zmm10 -; AVX512-NEXT: vpternlogq $248, %zmm21, %zmm2, %zmm10 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm2 & zmm21) | (zmm10 & ~zmm2 & ~zmm21) | (zmm10 & ~zmm2 & zmm21) | (zmm10 & zmm2 & ~zmm21) | (zmm10 & zmm2 & zmm21) ; AVX512-NEXT: vmovdqa (%rdx), %xmm2 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] ; AVX512-NEXT: vpshufb %xmm13, %xmm4, %xmm4 @@ -3441,11 +3441,11 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vinserti32x4 $2, 
%xmm4, %zmm7, %zmm4 ; AVX512-NEXT: vpermq {{.*#+}} zmm4 = zmm4[0,1,0,1,4,5,4,5] ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512-NEXT: vpternlogq $226, %zmm2, %zmm7, %zmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm7 & zmm2) | (zmm4 & ~zmm7 & zmm2) | (zmm4 & zmm7 & ~zmm2) | (zmm4 & zmm7 & zmm2) ; AVX512-NEXT: vpbroadcastq (%r8), %ymm2 ; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm9[0,1,1,1] ; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2 -; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm4 & mem) | (zmm2 & ~zmm4 & ~mem) | (zmm2 & zmm4 & ~mem) | (zmm2 & zmm4 & mem) ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,1,2,1,4,5,6,5] ; AVX512-NEXT: vprolq $16, %ymm3, %ymm8 ; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1],ymm4[2],ymm8[3],ymm4[4],ymm8[5,6],ymm4[7],ymm8[8,9],ymm4[10],ymm8[11],ymm4[12],ymm8[13,14],ymm4[15] @@ -3468,12 +3468,12 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2],ymm6[3,4],ymm0[5,6,7,8],ymm6[9],ymm0[10],ymm6[11,12],ymm0[13,14,15] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 -; AVX512-NEXT: vpternlogq $226, %zmm4, %zmm7, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & ~zmm7 & zmm4) | (zmm0 & ~zmm7 & zmm4) | (zmm0 & zmm7 & ~zmm4) | (zmm0 & zmm7 & zmm4) ; AVX512-NEXT: vpbroadcastq 48(%r8), %ymm4 ; AVX512-NEXT: vpbroadcastq 56(%r8), %ymm6 ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512-NEXT: vpternlogd $184, %zmm0, %zmm6, %zmm4 +; 
AVX512-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm6 & zmm0) | (zmm4 & ~zmm6 & ~zmm0) | (zmm4 & ~zmm6 & zmm0) | (zmm4 & zmm6 & zmm0) ; AVX512-NEXT: vmovdqa64 %ymm22, %ymm0 ; AVX512-NEXT: vpshufb %ymm0, %ymm5, %ymm0 ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm20[1,1,2,2] @@ -3492,11 +3492,11 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 -; AVX512-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm16 & zmm0) | (zmm1 & ~zmm16 & zmm0) | (zmm1 & zmm16 & ~zmm0) | (zmm1 & zmm16 & zmm0) ; AVX512-NEXT: vpandnq 16(%r8){1to4}, %ymm6, %ymm0 ; AVX512-NEXT: vpshufb %ymm11, %ymm9, %ymm3 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm1 & mem) | (zmm0 & ~zmm1 & ~mem) | (zmm0 & ~zmm1 & mem) | (zmm0 & zmm1 & ~mem) | (zmm0 & zmm1 & mem) ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512-NEXT: vmovdqa64 %zmm4, 256(%r9) ; AVX512-NEXT: vmovdqa64 %zmm2, (%r9) @@ -3540,7 +3540,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm13[1],xmm2[2],xmm13[3],xmm2[4,5],xmm13[6],xmm2[7] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm0 & mem) | (zmm1 & ~zmm0 & ~mem) | (zmm1 & zmm0 & ~mem) | (zmm1 & zmm0 & mem) ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm0 ; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm2 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = 
[128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] @@ -3550,7 +3550,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,1] ; AVX512-FCP-NEXT: vpandnq %ymm2, %ymm16, %ymm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm18 -; AVX512-FCP-NEXT: vpternlogq $248, %zmm16, %zmm1, %zmm18 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = (~zmm18 & zmm1 & zmm16) | (zmm18 & ~zmm1 & ~zmm16) | (zmm18 & ~zmm1 & zmm16) | (zmm18 & zmm1 & ~zmm16) | (zmm18 & zmm1 & zmm16) ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm3 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm15 @@ -3574,11 +3574,11 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm14, %zmm16, %zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm16 & zmm14) | (zmm1 & ~zmm16 & zmm14) | (zmm1 & zmm16 & ~zmm14) | (zmm1 & zmm16 & zmm14) ; AVX512-FCP-NEXT: vpbroadcastq (%r8), %ymm3 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm0[0,1,1,1] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm10 -; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm10 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm10 = (~zmm10 & zmm1 & mem) | (zmm10 & ~zmm1 & ~mem) | (zmm10 & zmm1 & ~mem) | (zmm10 & zmm1 & mem) ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm15 ; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm1 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm15, %ymm1 @@ -3602,13 +3602,13 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.ve ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm13, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm3, %zmm19, %zmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & ~zmm19 & zmm3) | (zmm2 & ~zmm19 & zmm3) | (zmm2 & zmm19 & ~zmm3) | (zmm2 & zmm19 & zmm3) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] ; AVX512-FCP-NEXT: vpandnq 16(%r8){1to4}, %ymm3, %ymm13 ; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm12 ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm2 & mem) | (zmm0 & ~zmm2 & ~mem) | (zmm0 & ~zmm2 & mem) | (zmm0 & zmm2 & ~mem) | (zmm0 & zmm2 & mem) ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] ; AVX512-FCP-NEXT: # ymm13 = mem[0,1,0,1] ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm5, %ymm2 @@ -3628,11 +3628,11 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5],ymm8[6],ymm1[7,8],ymm8[9],ymm1[10,11],ymm8[12],ymm1[13],ymm8[14],ymm1[15] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [2,3,2,3,10,10,11,10] ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm1, %zmm8 -; AVX512-FCP-NEXT: vpternlogq $226, %zmm7, %zmm16, %zmm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm16 & zmm7) | (zmm8 & ~zmm16 & zmm7) | (zmm8 & zmm16 & ~zmm7) | (zmm8 & zmm16 & zmm7) ; 
AVX512-FCP-NEXT: vpbroadcastq 48(%r8), %ymm1 ; AVX512-FCP-NEXT: vpbroadcastq 56(%r8), %ymm5 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vpternlogd $184, %zmm8, %zmm3, %zmm1 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm3 & zmm8) | (zmm1 & ~zmm3 & ~zmm8) | (zmm1 & ~zmm3 & zmm8) | (zmm1 & zmm3 & zmm8) ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm5 ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 @@ -3651,11 +3651,11 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,3,2,2,8,9,8,9] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vpternlogq $226, %zmm4, %zmm19, %zmm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & ~zmm19 & zmm4) | (zmm5 & ~zmm19 & zmm4) | (zmm5 & zmm19 & ~zmm4) | (zmm5 & zmm19 & zmm4) ; AVX512-FCP-NEXT: vpbroadcastq 24(%r8), %ymm2 ; AVX512-FCP-NEXT: vpbroadcastq 32(%r8), %ymm3 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm5 & mem) | (zmm2 & ~zmm5 & ~mem) | (zmm2 & zmm5 & ~mem) | (zmm2 & zmm5 & mem) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 128(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 256(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) @@ -3697,11 +3697,11 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,2] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm2[0,1,0,1] ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = 
[65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & ~zmm16 & zmm0) | (zmm3 & ~zmm16 & zmm0) | (zmm3 & zmm16 & ~zmm0) | (zmm3 & zmm16 & zmm0) ; AVX512DQ-NEXT: vpbroadcastq 24(%r8), %ymm0 ; AVX512DQ-NEXT: vpbroadcastq 32(%r8), %ymm2 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19 -; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm19 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm19 = (~zmm19 & zmm3 & mem) | (zmm19 & ~zmm3 & ~mem) | (zmm19 & zmm3 & ~mem) | (zmm19 & zmm3 & mem) ; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm3 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] ; AVX512DQ-NEXT: vpshufb %ymm2, %ymm3, %ymm0 @@ -3727,7 +3727,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm9, %zmm2 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm11 & mem) | (zmm2 & ~zmm11 & ~mem) | (zmm2 & zmm11 & ~mem) | (zmm2 & zmm11 & mem) ; AVX512DQ-NEXT: vmovdqa (%r8), %ymm9 ; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm10 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] @@ -3736,7 +3736,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,1] ; AVX512DQ-NEXT: vpandnq %ymm10, %ymm21, %ymm10 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm10, %zmm10 -; AVX512DQ-NEXT: vpternlogq $248, %zmm21, 
%zmm2, %zmm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm2 & zmm21) | (zmm10 & ~zmm2 & ~zmm21) | (zmm10 & ~zmm2 & zmm21) | (zmm10 & zmm2 & ~zmm21) | (zmm10 & zmm2 & zmm21) ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] ; AVX512DQ-NEXT: vpshufb %xmm13, %xmm4, %xmm4 @@ -3755,11 +3755,11 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm4, %zmm7, %zmm4 ; AVX512DQ-NEXT: vpermq {{.*#+}} zmm4 = zmm4[0,1,0,1,4,5,4,5] ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $226, %zmm2, %zmm7, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm7 & zmm2) | (zmm4 & ~zmm7 & zmm2) | (zmm4 & zmm7 & ~zmm2) | (zmm4 & zmm7 & zmm2) ; AVX512DQ-NEXT: vpbroadcastq (%r8), %ymm2 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm9[0,1,1,1] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm4 & mem) | (zmm2 & ~zmm4 & ~mem) | (zmm2 & zmm4 & ~mem) | (zmm2 & zmm4 & mem) ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,1,2,1,4,5,6,5] ; AVX512DQ-NEXT: vprolq $16, %ymm3, %ymm8 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1],ymm4[2],ymm8[3],ymm4[4],ymm8[5,6],ymm4[7],ymm8[8,9],ymm4[10],ymm8[11],ymm4[12],ymm8[13,14],ymm4[15] @@ -3782,12 +3782,12 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2],ymm6[3,4],ymm0[5,6,7,8],ymm6[9],ymm0[10],ymm6[11,12],ymm0[13,14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 -; AVX512DQ-NEXT: vpternlogq $226, 
%zmm4, %zmm7, %zmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & ~zmm7 & zmm4) | (zmm0 & ~zmm7 & zmm4) | (zmm0 & zmm7 & ~zmm4) | (zmm0 & zmm7 & zmm4) ; AVX512DQ-NEXT: vpbroadcastq 48(%r8), %ymm4 ; AVX512DQ-NEXT: vpbroadcastq 56(%r8), %ymm6 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512DQ-NEXT: vpternlogd $184, %zmm0, %zmm6, %zmm4 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm6 & zmm0) | (zmm4 & ~zmm6 & ~zmm0) | (zmm4 & ~zmm6 & zmm0) | (zmm4 & zmm6 & zmm0) ; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm20[1,1,2,2] @@ -3806,11 +3806,11 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 -; AVX512DQ-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm16 & zmm0) | (zmm1 & ~zmm16 & zmm0) | (zmm1 & zmm16 & ~zmm0) | (zmm1 & zmm16 & zmm0) ; AVX512DQ-NEXT: vpandnq 16(%r8){1to4}, %ymm6, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm9, %ymm3 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512DQ-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm1 & mem) | (zmm0 & ~zmm1 & ~mem) | (zmm0 & ~zmm1 & mem) | (zmm0 & zmm1 & ~mem) | (zmm0 & zmm1 & mem) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 256(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%r9) @@ -3854,7 +3854,7 @@ define void @store_i16_stride5_vf32(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm13[1],xmm2[2],xmm13[3],xmm2[4,5],xmm13[6],xmm2[7] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm0 & mem) | (zmm1 & ~zmm0 & ~mem) | (zmm1 & zmm0 & ~mem) | (zmm1 & zmm0 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] @@ -3864,7 +3864,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,1] ; AVX512DQ-FCP-NEXT: vpandnq %ymm2, %ymm16, %ymm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm18 -; AVX512DQ-FCP-NEXT: vpternlogq $248, %zmm16, %zmm1, %zmm18 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = (~zmm18 & zmm1 & zmm16) | (zmm18 & ~zmm1 & ~zmm16) | (zmm18 & ~zmm1 & zmm16) | (zmm18 & zmm1 & ~zmm16) | (zmm18 & zmm1 & zmm16) ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm15 @@ -3888,11 +3888,11 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm14, %zmm16, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm16 & zmm14) | (zmm1 & ~zmm16 & zmm14) | (zmm1 & zmm16 & ~zmm14) 
| (zmm1 & zmm16 & zmm14) ; AVX512DQ-FCP-NEXT: vpbroadcastq (%r8), %ymm3 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm0[0,1,1,1] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm10 -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm10 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm10 = (~zmm10 & zmm1 & mem) | (zmm10 & ~zmm1 & ~mem) | (zmm10 & zmm1 & ~mem) | (zmm10 & zmm1 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm15, %ymm1 @@ -3916,13 +3916,13 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm13, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm3, %zmm19, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & ~zmm19 & zmm3) | (zmm2 & ~zmm19 & zmm3) | (zmm2 & zmm19 & ~zmm3) | (zmm2 & zmm19 & zmm3) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] ; AVX512DQ-FCP-NEXT: vpandnq 16(%r8){1to4}, %ymm3, %ymm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm12 ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm2 & mem) | (zmm0 & ~zmm2 & ~mem) | (zmm0 & ~zmm2 & mem) | (zmm0 & zmm2 & ~mem) | (zmm0 & zmm2 & mem) ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = 
[30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] ; AVX512DQ-FCP-NEXT: # ymm13 = mem[0,1,0,1] ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm5, %ymm2 @@ -3942,11 +3942,11 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5],ymm8[6],ymm1[7,8],ymm8[9],ymm1[10,11],ymm8[12],ymm1[13],ymm8[14],ymm1[15] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [2,3,2,3,10,10,11,10] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm7, %zmm16, %zmm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm16 & zmm7) | (zmm8 & ~zmm16 & zmm7) | (zmm8 & zmm16 & ~zmm7) | (zmm8 & zmm16 & zmm7) ; AVX512DQ-FCP-NEXT: vpbroadcastq 48(%r8), %ymm1 ; AVX512DQ-FCP-NEXT: vpbroadcastq 56(%r8), %ymm5 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm8, %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm3 & zmm8) | (zmm1 & ~zmm3 & ~zmm8) | (zmm1 & ~zmm3 & zmm8) | (zmm1 & zmm3 & zmm8) ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm5 ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 @@ -3965,11 +3965,11 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,3,2,2,8,9,8,9] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm4, %zmm19, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & ~zmm19 & zmm4) | (zmm5 & ~zmm19 & zmm4) | (zmm5 & zmm19 & ~zmm4) | (zmm5 & zmm19 & zmm4) ; AVX512DQ-FCP-NEXT: vpbroadcastq 24(%r8), %ymm2 ; 
AVX512DQ-FCP-NEXT: vpbroadcastq 32(%r8), %ymm3 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm5 & mem) | (zmm2 & ~zmm5 & ~mem) | (zmm2 & zmm5 & ~mem) | (zmm2 & zmm5 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 128(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 256(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) @@ -6984,20 +6984,23 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufb %ymm7, %ymm2, %ymm4 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm31 # 64-byte Folded Reload +; AVX512-NEXT: # zmm31 = (~zmm31 & ~zmm19 & mem) | (zmm31 & ~zmm19 & mem) | (zmm31 & zmm19 & ~mem) | (zmm31 & zmm19 & mem) ; AVX512-NEXT: vpbroadcastq 88(%r8), %ymm1 ; AVX512-NEXT: vpbroadcastq 96(%r8), %ymm2 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX512-NEXT: vpternlogd $184, %zmm31, %zmm18, %zmm1 -; AVX512-NEXT: vpternlogq $226, %zmm25, %zmm19, %zmm27 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm18 & zmm31) | (zmm1 & ~zmm18 & ~zmm31) | (zmm1 & ~zmm18 & zmm31) | (zmm1 & zmm18 & zmm31) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm27 = (~zmm27 & ~zmm19 & zmm25) | (zmm27 & ~zmm19 & zmm25) | (zmm27 & zmm19 & ~zmm25) | (zmm27 & zmm19 & zmm25) ; AVX512-NEXT: vpbroadcastq 24(%r8), %ymm2 ; AVX512-NEXT: vpbroadcastq 32(%r8), %ymm25 ; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm2, %zmm2 -; AVX512-NEXT: vpternlogd $184, %zmm27, %zmm18, %zmm2 +; AVX512-NEXT: vpternlogd 
{{.*#+}} zmm2 = (~zmm2 & zmm18 & zmm27) | (zmm2 & ~zmm18 & ~zmm27) | (zmm2 & ~zmm18 & zmm27) | (zmm2 & zmm18 & zmm27) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm31 # 64-byte Folded Reload +; AVX512-NEXT: # zmm31 = (~zmm31 & ~zmm18 & mem) | (zmm31 & ~zmm18 & mem) | (zmm31 & zmm18 & ~mem) | (zmm31 & zmm18 & mem) ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm24 # 64-byte Folded Reload +; AVX512-NEXT: # zmm24 = (~zmm24 & ~zmm18 & mem) | (zmm24 & ~zmm18 & mem) | (zmm24 & zmm18 & ~mem) | (zmm24 & zmm18 & mem) ; AVX512-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload ; AVX512-NEXT: # ymm18 = mem[0,1,0,1] ; AVX512-NEXT: vpermq $4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload @@ -7025,43 +7028,43 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vinserti64x4 $1, %ymm29, %zmm4, %zmm4 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512-NEXT: vpternlogq $248, %zmm21, %zmm31, %zmm23 -; AVX512-NEXT: vpternlogq $248, %zmm21, %zmm24, %zmm26 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm23 = (~zmm23 & zmm31 & zmm21) | (zmm23 & ~zmm31 & ~zmm21) | (zmm23 & ~zmm31 & zmm21) | (zmm23 & zmm31 & ~zmm21) | (zmm23 & zmm31 & zmm21) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm26 = (~zmm26 & zmm24 & zmm21) | (zmm26 & ~zmm24 & ~zmm21) | (zmm26 & ~zmm24 & zmm21) | (zmm26 & zmm24 & ~zmm21) | (zmm26 & zmm24 & zmm21) ; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm18, %zmm18 ; 
AVX512-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Folded Reload ; AVX512-NEXT: # zmm21 = mem[0,1,0,1,4,5,4,5] ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512-NEXT: vpternlogq $226, %zmm18, %zmm24, %zmm21 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm21 = (~zmm21 & ~zmm24 & zmm18) | (zmm21 & ~zmm24 & zmm18) | (zmm21 & zmm24 & ~zmm18) | (zmm21 & zmm24 & zmm18) ; AVX512-NEXT: vpbroadcastq 64(%r8), %ymm18 ; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm18, %zmm8 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512-NEXT: vpternlogd $184, %zmm21, %zmm18, %zmm8 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm8 = (~zmm8 & zmm18 & zmm21) | (zmm8 & ~zmm18 & ~zmm21) | (zmm8 & ~zmm18 & zmm21) | (zmm8 & zmm18 & zmm21) ; AVX512-NEXT: vinserti64x4 $1, %ymm28, %zmm27, %zmm21 ; AVX512-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Folded Reload ; AVX512-NEXT: # zmm22 = mem[0,1,0,1,4,5,4,5] -; AVX512-NEXT: vpternlogq $226, %zmm21, %zmm24, %zmm22 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = (~zmm22 & ~zmm24 & zmm21) | (zmm22 & ~zmm24 & zmm21) | (zmm22 & zmm24 & ~zmm21) | (zmm22 & zmm24 & zmm21) ; AVX512-NEXT: vpbroadcastq (%r8), %ymm21 ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm21, %zmm10 -; AVX512-NEXT: vpternlogd $184, %zmm22, %zmm18, %zmm10 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm10 = (~zmm10 & zmm18 & zmm22) | (zmm10 & ~zmm18 & ~zmm22) | (zmm10 & ~zmm18 & zmm22) | (zmm10 & zmm18 & zmm22) ; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm13 ; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 -; AVX512-NEXT: vpternlogq $226, %zmm13, %zmm24, %zmm11 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & ~zmm24 & zmm13) | (zmm11 & ~zmm24 & zmm13) | (zmm11 & 
zmm24 & ~zmm13) | (zmm11 & zmm24 & zmm13) ; AVX512-NEXT: vpbroadcastq 112(%r8), %ymm12 ; AVX512-NEXT: vpbroadcastq 120(%r8), %ymm13 ; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512-NEXT: vpternlogd $184, %zmm11, %zmm16, %zmm12 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm12 = (~zmm12 & zmm16 & zmm11) | (zmm12 & ~zmm16 & ~zmm11) | (zmm12 & ~zmm16 & zmm11) | (zmm12 & zmm16 & zmm11) ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm7 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512-NEXT: vpternlogq $226, %zmm7, %zmm24, %zmm5 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & ~zmm24 & zmm7) | (zmm5 & ~zmm24 & zmm7) | (zmm5 & zmm24 & ~zmm7) | (zmm5 & zmm24 & zmm7) ; AVX512-NEXT: vpbroadcastq 48(%r8), %ymm6 ; AVX512-NEXT: vpbroadcastq 56(%r8), %ymm7 ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512-NEXT: vpternlogd $184, %zmm5, %zmm16, %zmm6 -; AVX512-NEXT: vpternlogq $226, %zmm30, %zmm19, %zmm9 -; AVX512-NEXT: vpternlogq $226, %zmm20, %zmm19, %zmm0 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm16 & zmm5) | (zmm6 & ~zmm16 & ~zmm5) | (zmm6 & ~zmm16 & zmm5) | (zmm6 & zmm16 & zmm5) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = (~zmm9 & ~zmm19 & zmm30) | (zmm9 & ~zmm19 & zmm30) | (zmm9 & zmm19 & ~zmm30) | (zmm9 & zmm19 & zmm30) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & ~zmm19 & zmm20) | (zmm0 & ~zmm19 & zmm20) | (zmm0 & zmm19 & ~zmm20) | (zmm0 & zmm19 & zmm20) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $248, %zmm5, %zmm9, %zmm17 -; AVX512-NEXT: vpternlogq $248, %zmm5, %zmm0, %zmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = (~zmm17 & zmm9 & zmm5) | (zmm17 & ~zmm9 & ~zmm5) | (zmm17 & ~zmm9 & zmm5) | (zmm17 & zmm9 & ~zmm5) | (zmm17 & zmm9 & zmm5) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm0 & zmm5) | (zmm4 & ~zmm0 & 
~zmm5) | (zmm4 & ~zmm0 & zmm5) | (zmm4 & zmm0 & ~zmm5) | (zmm4 & zmm0 & zmm5) ; AVX512-NEXT: vmovdqa64 %zmm4, 384(%r9) ; AVX512-NEXT: vmovdqa64 %zmm17, 64(%r9) ; AVX512-NEXT: vmovdqa64 %zmm6, 256(%r9) @@ -7184,7 +7187,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 ; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm20 = zmm2[0,1,0,1,4,5,4,5] ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm1, %zmm31, %zmm20 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = (~zmm20 & ~zmm31 & zmm1) | (zmm20 & ~zmm31 & zmm1) | (zmm20 & zmm31 & ~zmm1) | (zmm20 & zmm31 & zmm1) ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2 @@ -7194,7 +7197,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 ; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm21 = zmm4[0,1,0,1,4,5,4,5] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm1, %zmm31, %zmm21 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = (~zmm21 & ~zmm31 & zmm1) | (zmm21 & ~zmm31 & zmm1) | (zmm21 & zmm31 & ~zmm1) | (zmm21 & zmm31 & zmm1) ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm2 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm28 @@ -7243,11 +7246,11 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [2,3,2,3,10,10,11,10] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 -; AVX512-FCP-NEXT: vpternlogq $226, 
%zmm1, %zmm31, %zmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & ~zmm31 & zmm1) | (zmm2 & ~zmm31 & zmm1) | (zmm2 & zmm31 & ~zmm1) | (zmm2 & zmm31 & zmm1) ; AVX512-FCP-NEXT: vpbroadcastq 112(%r8), %ymm0 ; AVX512-FCP-NEXT: vpbroadcastq 120(%r8), %ymm1 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm14 -; AVX512-FCP-NEXT: vpternlogd $184, %zmm2, %zmm12, %zmm14 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm14 = (~zmm14 & zmm12 & zmm2) | (zmm14 & ~zmm12 & ~zmm2) | (zmm14 & ~zmm12 & zmm2) | (zmm14 & zmm12 & zmm2) ; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %ymm8 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm0 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm23 @@ -7288,11 +7291,11 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm29[1,1,1,2,5,5,5,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm7[0],ymm15[1],ymm7[2,3],ymm15[4],ymm7[5],ymm15[6],ymm7[7,8],ymm15[9],ymm7[10,11],ymm15[12],ymm7[13],ymm15[14],ymm7[15] ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm18, %zmm15 -; AVX512-FCP-NEXT: vpternlogq $226, %zmm11, %zmm31, %zmm15 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = (~zmm15 & ~zmm31 & zmm11) | (zmm15 & ~zmm31 & zmm11) | (zmm15 & zmm31 & ~zmm11) | (zmm15 & zmm31 & zmm11) ; AVX512-FCP-NEXT: vpbroadcastq 48(%r8), %ymm7 ; AVX512-FCP-NEXT: vpbroadcastq 56(%r8), %ymm11 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm7, %zmm7 -; AVX512-FCP-NEXT: vpternlogd $184, %zmm15, %zmm12, %zmm7 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = (~zmm7 & zmm12 & zmm15) | (zmm7 & ~zmm12 & ~zmm15) | (zmm7 & ~zmm12 & zmm15) | (zmm7 & zmm12 & zmm15) ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload ; AVX512-FCP-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3] @@ -7315,7 +7318,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [2,3,2,2,8,9,8,9] ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm0, %zmm1, %zmm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm1 & zmm0) | (zmm8 & ~zmm1 & zmm0) | (zmm8 & zmm1 & ~zmm0) | (zmm8 & zmm1 & zmm0) ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512-FCP-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] @@ -7329,7 +7332,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpbroadcastq 96(%r8), %ymm5 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX512-FCP-NEXT: vpternlogd $184, %zmm8, %zmm5, %zmm0 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm5 & zmm8) | (zmm0 & ~zmm5 & ~zmm8) | (zmm0 & ~zmm5 & zmm8) | (zmm0 & zmm5 & zmm8) ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm8 ; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX512-FCP-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] @@ -7340,31 +7343,34 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm28[2,3,2,3,6,7,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7,8],ymm11[9],ymm6[10],ymm11[11],ymm6[12,13],ymm11[14],ymm6[15] ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm6 -; 
AVX512-FCP-NEXT: vpternlogq $226, %zmm4, %zmm1, %zmm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = (~zmm6 & ~zmm1 & zmm4) | (zmm6 & ~zmm1 & zmm4) | (zmm6 & zmm1 & ~zmm4) | (zmm6 & zmm1 & zmm4) ; AVX512-FCP-NEXT: vpbroadcastq 24(%r8), %ymm4 ; AVX512-FCP-NEXT: vpbroadcastq 32(%r8), %ymm8 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vpternlogd $184, %zmm6, %zmm5, %zmm4 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm5 & zmm6) | (zmm4 & ~zmm5 & ~zmm6) | (zmm4 & ~zmm5 & zmm6) | (zmm4 & zmm5 & zmm6) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm3 = (~zmm3 & ~zmm5 & mem) | (zmm3 & ~zmm5 & mem) | (zmm3 & zmm5 & ~mem) | (zmm3 & zmm5 & mem) ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm16 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm16 = (~zmm16 & ~zmm5 & mem) | (zmm16 & ~zmm5 & mem) | (zmm16 & zmm5 & ~mem) | (zmm16 & zmm5 & mem) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $248, %zmm5, %zmm3, %zmm8 -; AVX512-FCP-NEXT: vpternlogq $248, %zmm5, %zmm16, %zmm17 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & zmm3 & zmm5) | (zmm8 & ~zmm3 & ~zmm5) | (zmm8 & ~zmm3 & zmm5) | (zmm8 & zmm3 & ~zmm5) | (zmm8 & zmm3 & zmm5) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = (~zmm17 & zmm16 & zmm5) | (zmm17 & ~zmm16 & ~zmm5) | (zmm17 & ~zmm16 & zmm5) | (zmm17 & zmm16 & ~zmm5) | (zmm17 & zmm16 & zmm5) 
; AVX512-FCP-NEXT: vpbroadcastq 64(%r8), %ymm5 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogd $184, %zmm20, %zmm5, %zmm2 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm5 & zmm20) | (zmm2 & ~zmm5 & ~zmm20) | (zmm2 & ~zmm5 & zmm20) | (zmm2 & zmm5 & zmm20) ; AVX512-FCP-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6 -; AVX512-FCP-NEXT: vpternlogd $184, %zmm21, %zmm5, %zmm6 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm5 & zmm21) | (zmm6 & ~zmm5 & ~zmm21) | (zmm6 & ~zmm5 & zmm21) | (zmm6 & zmm5 & zmm21) ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm25 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq $226, %zmm19, %zmm1, %zmm9 +; AVX512-FCP-NEXT: # zmm25 = (~zmm25 & ~zmm1 & mem) | (zmm25 & ~zmm1 & mem) | (zmm25 & zmm1 & ~mem) | (zmm25 & zmm1 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = (~zmm9 & ~zmm1 & zmm19) | (zmm9 & ~zmm1 & zmm19) | (zmm9 & zmm1 & ~zmm19) | (zmm9 & zmm1 & zmm19) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $248, %zmm1, %zmm25, %zmm27 -; AVX512-FCP-NEXT: vpternlogq $248, %zmm1, %zmm9, %zmm24 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = (~zmm27 & zmm25 & zmm1) | (zmm27 & ~zmm25 & ~zmm1) | (zmm27 & ~zmm25 & zmm1) | (zmm27 & zmm25 & ~zmm1) | (zmm27 & zmm25 & zmm1) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = (~zmm24 & zmm9 & zmm1) | (zmm24 & ~zmm9 & ~zmm1) | (zmm24 & ~zmm9 & zmm1) | (zmm24 & zmm9 & ~zmm1) | (zmm24 & zmm9 & zmm1) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 128(%r9) ; AVX512-FCP-NEXT: 
vmovdqa64 %zmm7, 256(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 448(%r9) @@ -7611,20 +7617,23 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm2, %ymm4 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] ; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm31 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm31 = (~zmm31 & ~zmm19 & mem) | (zmm31 & ~zmm19 & mem) | (zmm31 & zmm19 & ~mem) | (zmm31 & zmm19 & mem) ; AVX512DQ-NEXT: vpbroadcastq 88(%r8), %ymm1 ; AVX512DQ-NEXT: vpbroadcastq 96(%r8), %ymm2 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX512DQ-NEXT: vpternlogd $184, %zmm31, %zmm18, %zmm1 -; AVX512DQ-NEXT: vpternlogq $226, %zmm25, %zmm19, %zmm27 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm18 & zmm31) | (zmm1 & ~zmm18 & ~zmm31) | (zmm1 & ~zmm18 & zmm31) | (zmm1 & zmm18 & zmm31) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm27 = (~zmm27 & ~zmm19 & zmm25) | (zmm27 & ~zmm19 & zmm25) | (zmm27 & zmm19 & ~zmm25) | (zmm27 & zmm19 & zmm25) ; AVX512DQ-NEXT: vpbroadcastq 24(%r8), %ymm2 ; AVX512DQ-NEXT: vpbroadcastq 32(%r8), %ymm25 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpternlogd $184, %zmm27, %zmm18, %zmm2 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm18 & zmm27) | (zmm2 & ~zmm18 & ~zmm27) | (zmm2 & ~zmm18 & zmm27) | (zmm2 & zmm18 & zmm27) ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 
# 64-byte Reload ; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm31 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm31 = (~zmm31 & ~zmm18 & mem) | (zmm31 & ~zmm18 & mem) | (zmm31 & zmm18 & ~mem) | (zmm31 & zmm18 & mem) ; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm24 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm24 = (~zmm24 & ~zmm18 & mem) | (zmm24 & ~zmm18 & mem) | (zmm24 & zmm18 & ~mem) | (zmm24 & zmm18 & mem) ; AVX512DQ-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm18 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpermq $4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload @@ -7652,43 +7661,43 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm29, %zmm4, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogq $248, %zmm21, %zmm31, %zmm23 -; AVX512DQ-NEXT: vpternlogq $248, %zmm21, %zmm24, %zmm26 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm23 = (~zmm23 & zmm31 & zmm21) | (zmm23 & ~zmm31 & ~zmm21) | (zmm23 & ~zmm31 & zmm21) | (zmm23 & zmm31 & ~zmm21) | (zmm23 & zmm31 & zmm21) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm26 = (~zmm26 & zmm24 & zmm21) | (zmm26 & ~zmm24 & ~zmm21) | (zmm26 & ~zmm24 & zmm21) | (zmm26 & zmm24 & ~zmm21) | (zmm26 & zmm24 & zmm21) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm18, %zmm18 ; AVX512DQ-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Folded Reload ; AVX512DQ-NEXT: # zmm21 = mem[0,1,0,1,4,5,4,5] ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; 
AVX512DQ-NEXT: vpternlogq $226, %zmm18, %zmm24, %zmm21 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = (~zmm21 & ~zmm24 & zmm18) | (zmm21 & ~zmm24 & zmm18) | (zmm21 & zmm24 & ~zmm18) | (zmm21 & zmm24 & zmm18) ; AVX512DQ-NEXT: vpbroadcastq 64(%r8), %ymm18 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm18, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vpternlogd $184, %zmm21, %zmm18, %zmm8 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm8 = (~zmm8 & zmm18 & zmm21) | (zmm8 & ~zmm18 & ~zmm21) | (zmm8 & ~zmm18 & zmm21) | (zmm8 & zmm18 & zmm21) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm28, %zmm27, %zmm21 ; AVX512DQ-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Folded Reload ; AVX512DQ-NEXT: # zmm22 = mem[0,1,0,1,4,5,4,5] -; AVX512DQ-NEXT: vpternlogq $226, %zmm21, %zmm24, %zmm22 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = (~zmm22 & ~zmm24 & zmm21) | (zmm22 & ~zmm24 & zmm21) | (zmm22 & zmm24 & ~zmm21) | (zmm22 & zmm24 & zmm21) ; AVX512DQ-NEXT: vpbroadcastq (%r8), %ymm21 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm21, %zmm10 -; AVX512DQ-NEXT: vpternlogd $184, %zmm22, %zmm18, %zmm10 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm10 = (~zmm10 & zmm18 & zmm22) | (zmm10 & ~zmm18 & ~zmm22) | (zmm10 & ~zmm18 & zmm22) | (zmm10 & zmm18 & zmm22) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm13 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 -; AVX512DQ-NEXT: vpternlogq $226, %zmm13, %zmm24, %zmm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & ~zmm24 & zmm13) | (zmm11 & ~zmm24 & zmm13) | (zmm11 & zmm24 & ~zmm13) | (zmm11 & zmm24 & zmm13) ; AVX512DQ-NEXT: vpbroadcastq 112(%r8), %ymm12 ; AVX512DQ-NEXT: vpbroadcastq 120(%r8), %ymm13 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512DQ-NEXT: vpternlogd $184, %zmm11, %zmm16, %zmm12 +; AVX512DQ-NEXT: vpternlogd 
{{.*#+}} zmm12 = (~zmm12 & zmm16 & zmm11) | (zmm12 & ~zmm16 & ~zmm11) | (zmm12 & ~zmm16 & zmm11) | (zmm12 & zmm16 & zmm11) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm7 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512DQ-NEXT: vpternlogq $226, %zmm7, %zmm24, %zmm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & ~zmm24 & zmm7) | (zmm5 & ~zmm24 & zmm7) | (zmm5 & zmm24 & ~zmm7) | (zmm5 & zmm24 & zmm7) ; AVX512DQ-NEXT: vpbroadcastq 48(%r8), %ymm6 ; AVX512DQ-NEXT: vpbroadcastq 56(%r8), %ymm7 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512DQ-NEXT: vpternlogd $184, %zmm5, %zmm16, %zmm6 -; AVX512DQ-NEXT: vpternlogq $226, %zmm30, %zmm19, %zmm9 -; AVX512DQ-NEXT: vpternlogq $226, %zmm20, %zmm19, %zmm0 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm16 & zmm5) | (zmm6 & ~zmm16 & ~zmm5) | (zmm6 & ~zmm16 & zmm5) | (zmm6 & zmm16 & zmm5) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = (~zmm9 & ~zmm19 & zmm30) | (zmm9 & ~zmm19 & zmm30) | (zmm9 & zmm19 & ~zmm30) | (zmm9 & zmm19 & zmm30) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & ~zmm19 & zmm20) | (zmm0 & ~zmm19 & zmm20) | (zmm0 & zmm19 & ~zmm20) | (zmm0 & zmm19 & zmm20) ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $248, %zmm5, %zmm9, %zmm17 -; AVX512DQ-NEXT: vpternlogq $248, %zmm5, %zmm0, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = (~zmm17 & zmm9 & zmm5) | (zmm17 & ~zmm9 & ~zmm5) | (zmm17 & ~zmm9 & zmm5) | (zmm17 & zmm9 & ~zmm5) | (zmm17 & zmm9 & zmm5) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm0 & zmm5) | (zmm4 & ~zmm0 & ~zmm5) | (zmm4 & ~zmm0 & zmm5) | (zmm4 & zmm0 & ~zmm5) | (zmm4 & zmm0 & zmm5) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 384(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 64(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 256(%r9) @@ -7811,7 +7820,7 @@ define 
void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm20 = zmm2[0,1,0,1,4,5,4,5] ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm1, %zmm31, %zmm20 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = (~zmm20 & ~zmm31 & zmm1) | (zmm20 & ~zmm31 & zmm1) | (zmm20 & zmm31 & ~zmm1) | (zmm20 & zmm31 & zmm1) ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2 @@ -7821,7 +7830,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm21 = zmm4[0,1,0,1,4,5,4,5] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm1, %zmm31, %zmm21 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = (~zmm21 & ~zmm31 & zmm1) | (zmm21 & ~zmm31 & zmm1) | (zmm21 & zmm31 & ~zmm1) | (zmm21 & zmm31 & zmm1) ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm28 @@ -7870,11 +7879,11 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [2,3,2,3,10,10,11,10] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm1, %zmm31, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & ~zmm31 & zmm1) | (zmm2 & ~zmm31 & zmm1) | (zmm2 & zmm31 & ~zmm1) | (zmm2 & zmm31 & zmm1) ; AVX512DQ-FCP-NEXT: 
vpbroadcastq 112(%r8), %ymm0 ; AVX512DQ-FCP-NEXT: vpbroadcastq 120(%r8), %ymm1 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm14 -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm2, %zmm12, %zmm14 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm14 = (~zmm14 & zmm12 & zmm2) | (zmm14 & ~zmm12 & ~zmm2) | (zmm14 & ~zmm12 & zmm2) | (zmm14 & zmm12 & zmm2) ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %ymm8 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm23 @@ -7915,11 +7924,11 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm29[1,1,1,2,5,5,5,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm7[0],ymm15[1],ymm7[2,3],ymm15[4],ymm7[5],ymm15[6],ymm7[7,8],ymm15[9],ymm7[10,11],ymm15[12],ymm7[13],ymm15[14],ymm7[15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm18, %zmm15 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm11, %zmm31, %zmm15 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = (~zmm15 & ~zmm31 & zmm11) | (zmm15 & ~zmm31 & zmm11) | (zmm15 & zmm31 & ~zmm11) | (zmm15 & zmm31 & zmm11) ; AVX512DQ-FCP-NEXT: vpbroadcastq 48(%r8), %ymm7 ; AVX512DQ-FCP-NEXT: vpbroadcastq 56(%r8), %ymm11 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm7, %zmm7 -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm15, %zmm12, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = (~zmm7 & zmm12 & zmm15) | (zmm7 & ~zmm12 & ~zmm15) | (zmm7 & ~zmm12 & zmm15) | (zmm7 & zmm12 & zmm15) ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3] @@ -7942,7 +7951,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [2,3,2,2,8,9,8,9] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm1 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm0, %zmm1, %zmm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm1 & zmm0) | (zmm8 & ~zmm1 & zmm0) | (zmm8 & zmm1 & ~zmm0) | (zmm8 & zmm1 & zmm0) ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] @@ -7956,7 +7965,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpbroadcastq 96(%r8), %ymm5 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm8, %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm5 & zmm8) | (zmm0 & ~zmm5 & ~zmm8) | (zmm0 & ~zmm5 & zmm8) | (zmm0 & zmm5 & zmm8) ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm8 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] @@ -7967,31 +7976,34 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm28[2,3,2,3,6,7,6,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7,8],ymm11[9],ymm6[10],ymm11[11],ymm6[12,13],ymm11[14],ymm6[15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm6 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm4, %zmm1, %zmm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = (~zmm6 & ~zmm1 & 
zmm4) | (zmm6 & ~zmm1 & zmm4) | (zmm6 & zmm1 & ~zmm4) | (zmm6 & zmm1 & zmm4) ; AVX512DQ-FCP-NEXT: vpbroadcastq 24(%r8), %ymm4 ; AVX512DQ-FCP-NEXT: vpbroadcastq 32(%r8), %ymm8 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm6, %zmm5, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm5 & zmm6) | (zmm4 & ~zmm5 & ~zmm6) | (zmm4 & ~zmm5 & zmm6) | (zmm4 & zmm5 & zmm6) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm3 = (~zmm3 & ~zmm5 & mem) | (zmm3 & ~zmm5 & mem) | (zmm3 & zmm5 & ~mem) | (zmm3 & zmm5 & mem) ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm16 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm16 = (~zmm16 & ~zmm5 & mem) | (zmm16 & ~zmm5 & mem) | (zmm16 & zmm5 & ~mem) | (zmm16 & zmm5 & mem) ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $248, %zmm5, %zmm3, %zmm8 -; AVX512DQ-FCP-NEXT: vpternlogq $248, %zmm5, %zmm16, %zmm17 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & zmm3 & zmm5) | (zmm8 & ~zmm3 & ~zmm5) | (zmm8 & ~zmm3 & zmm5) | (zmm8 & zmm3 & ~zmm5) | (zmm8 & zmm3 & zmm5) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = (~zmm17 & zmm16 & zmm5) | (zmm17 & ~zmm16 & ~zmm5) | (zmm17 & ~zmm16 & zmm5) | (zmm17 & zmm16 & ~zmm5) | (zmm17 & zmm16 & zmm5) ; AVX512DQ-FCP-NEXT: vpbroadcastq 64(%r8), %ymm5 ; AVX512DQ-FCP-NEXT: vinserti64x4 
$1, %ymm2, %zmm5, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm20, %zmm5, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm5 & zmm20) | (zmm2 & ~zmm5 & ~zmm20) | (zmm2 & ~zmm5 & zmm20) | (zmm2 & zmm5 & zmm20) ; AVX512DQ-FCP-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm21, %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm5 & zmm21) | (zmm6 & ~zmm5 & ~zmm21) | (zmm6 & ~zmm5 & zmm21) | (zmm6 & zmm5 & zmm21) ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm25 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm19, %zmm1, %zmm9 +; AVX512DQ-FCP-NEXT: # zmm25 = (~zmm25 & ~zmm1 & mem) | (zmm25 & ~zmm1 & mem) | (zmm25 & zmm1 & ~mem) | (zmm25 & zmm1 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = (~zmm9 & ~zmm1 & zmm19) | (zmm9 & ~zmm1 & zmm19) | (zmm9 & zmm1 & ~zmm19) | (zmm9 & zmm1 & zmm19) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $248, %zmm1, %zmm25, %zmm27 -; AVX512DQ-FCP-NEXT: vpternlogq $248, %zmm1, %zmm9, %zmm24 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = (~zmm27 & zmm25 & zmm1) | (zmm27 & ~zmm25 & ~zmm1) | (zmm27 & ~zmm25 & zmm1) | (zmm27 & zmm25 & ~zmm1) | (zmm27 & zmm25 & zmm1) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = (~zmm24 & zmm9 & zmm1) | (zmm24 & ~zmm9 & ~zmm1) | (zmm24 & ~zmm9 & zmm1) | (zmm24 & zmm9 & ~zmm1) | (zmm24 & zmm9 & zmm1) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 128(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 256(%r9) ; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm0, 448(%r9) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll index c725dcd972cd5..c93d7b7a720c3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll @@ -4128,19 +4128,19 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512-NEXT: vpternlogd $184, %zmm16, %zmm9, %zmm8 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm8 = (~zmm8 & zmm9 & zmm16) | (zmm8 & ~zmm9 & ~zmm16) | (zmm8 & ~zmm9 & zmm16) | (zmm8 & zmm9 & zmm16) ; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm13 -; AVX512-NEXT: vpternlogd $184, %zmm17, %zmm9, %zmm13 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm13 = (~zmm13 & zmm9 & zmm17) | (zmm13 & ~zmm9 & ~zmm17) | (zmm13 & ~zmm9 & zmm17) | (zmm13 & zmm9 & zmm17) ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogd $184, %zmm2, %zmm5, %zmm0 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm5 & zmm2) | (zmm0 & ~zmm5 & ~zmm2) | (zmm0 & ~zmm5 & zmm2) | (zmm0 & zmm5 & zmm2) ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm2 -; AVX512-NEXT: vpternlogd $184, %zmm4, %zmm5, %zmm2 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm5 & zmm4) | (zmm2 & ~zmm5 & ~zmm4) | (zmm2 & ~zmm5 & zmm4) | (zmm2 & zmm5 & zmm4) ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm11, %zmm4 ; AVX512-NEXT: vmovdqa64 {{.*#+}} 
zmm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512-NEXT: vpternlogd $184, %zmm12, %zmm5, %zmm4 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm5 & zmm12) | (zmm4 & ~zmm5 & ~zmm12) | (zmm4 & ~zmm5 & zmm12) | (zmm4 & zmm5 & zmm12) ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm15, %zmm3 -; AVX512-NEXT: vpternlogd $184, %zmm1, %zmm5, %zmm3 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm3 = (~zmm3 & zmm5 & zmm1) | (zmm3 & ~zmm5 & ~zmm1) | (zmm3 & ~zmm5 & zmm1) | (zmm3 & zmm5 & zmm1) ; AVX512-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512-NEXT: vmovdqa64 %zmm4, 192(%rax) ; AVX512-NEXT: vmovdqa64 %zmm2, 256(%rax) @@ -4337,23 +4337,23 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm18, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogd $184, %zmm1, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm2 & zmm1) | (zmm0 & ~zmm2 & ~zmm1) | (zmm0 & ~zmm2 & zmm1) | (zmm0 & zmm2 & zmm1) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 256(%rax) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm22, %zmm0 -; AVX512-FCP-NEXT: vpternlogd $184, %zmm0, %zmm2, %zmm8 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm8 = (~zmm8 & zmm2 & zmm0) | (zmm8 & ~zmm2 & ~zmm0) | (zmm8 & ~zmm2 & zmm0) | (zmm8 & zmm2 & zmm0) ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 64(%rax) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; 
AVX512-FCP-NEXT: vpternlogd $184, %zmm21, %zmm0, %zmm15 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm15 = (~zmm15 & zmm0 & zmm21) | (zmm15 & ~zmm0 & ~zmm21) | (zmm15 & ~zmm0 & zmm21) | (zmm15 & zmm0 & zmm21) ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, (%rax) -; AVX512-FCP-NEXT: vpternlogd $184, %zmm19, %zmm0, %zmm14 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm14 = (~zmm14 & zmm0 & zmm19) | (zmm14 & ~zmm0 & ~zmm19) | (zmm14 & ~zmm0 & zmm19) | (zmm14 & zmm0 & zmm19) ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 192(%rax) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogd $184, %zmm17, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm0 & zmm17) | (zmm1 & ~zmm0 & ~zmm17) | (zmm1 & ~zmm0 & zmm17) | (zmm1 & zmm0 & zmm17) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogd $184, %zmm16, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm0 & zmm16) | (zmm1 & ~zmm0 & ~zmm16) | (zmm1 & ~zmm0 & zmm16) | (zmm1 & zmm0 & zmm16) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax) ; AVX512-FCP-NEXT: popq %rax ; AVX512-FCP-NEXT: vzeroupper @@ -4568,19 +4568,19 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm10 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vpternlogd $184, %zmm16, %zmm11, %zmm10 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm10 = (~zmm10 & zmm11 & zmm16) | (zmm10 & 
~zmm11 & ~zmm16) | (zmm10 & ~zmm11 & zmm16) | (zmm10 & zmm11 & zmm16) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm23, %zmm21, %zmm12 -; AVX512DQ-NEXT: vpternlogd $184, %zmm17, %zmm11, %zmm12 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm12 = (~zmm12 & zmm11 & zmm17) | (zmm12 & ~zmm11 & ~zmm17) | (zmm12 & ~zmm11 & zmm17) | (zmm12 & zmm11 & zmm17) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-NEXT: vpternlogd $184, %zmm22, %zmm4, %zmm0 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm4 & zmm22) | (zmm0 & ~zmm4 & ~zmm22) | (zmm0 & ~zmm4 & zmm22) | (zmm0 & zmm4 & zmm22) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpternlogd $184, %zmm18, %zmm4, %zmm7 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm7 = (~zmm7 & zmm4 & zmm18) | (zmm7 & ~zmm4 & ~zmm18) | (zmm7 & ~zmm4 & zmm18) | (zmm7 & zmm4 & zmm18) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogd $184, %zmm2, %zmm5, %zmm4 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm5 & zmm2) | (zmm4 & ~zmm5 & ~zmm2) | (zmm4 & ~zmm5 & zmm2) | (zmm4 & zmm5 & zmm2) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm9, %zmm1 -; AVX512DQ-NEXT: vpternlogd $184, %zmm3, %zmm5, %zmm1 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm5 & zmm3) | (zmm1 & ~zmm5 & ~zmm3) | (zmm1 & ~zmm5 & zmm3) | (zmm1 & zmm5 & zmm3) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 256(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%rax) @@ -4787,22 +4787,22 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.ve ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,0,2,1,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm4, %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm5 & zmm4) | (zmm6 & ~zmm5 & ~zmm4) | (zmm6 & ~zmm5 & zmm4) | (zmm6 & zmm5 & zmm4) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm3, %zmm5, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm5 & zmm3) | (zmm2 & ~zmm5 & ~zmm3) | (zmm2 & ~zmm5 & zmm3) | (zmm2 & zmm5 & zmm3) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm0, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm2 & zmm0) | (zmm1 & ~zmm2 & ~zmm0) | (zmm1 & ~zmm2 & zmm0) | (zmm1 & zmm2 & zmm0) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm18, %zmm2, %zmm14 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm14 = (~zmm14 & zmm2 & zmm18) | (zmm14 & ~zmm2 & ~zmm18) | (zmm14 & ~zmm2 & zmm18) | (zmm14 & zmm2 & zmm18) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 320(%rax) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm0, %zmm1, %zmm13 +; AVX512DQ-FCP-NEXT: vpternlogd 
{{.*#+}} zmm13 = (~zmm13 & zmm1 & zmm0) | (zmm13 & ~zmm1 & ~zmm0) | (zmm13 & ~zmm1 & zmm0) | (zmm13 & zmm1 & zmm0) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 256(%rax) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm23, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm0, %zmm1, %zmm15 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm15 = (~zmm15 & zmm1 & zmm0) | (zmm15 & ~zmm1 & ~zmm0) | (zmm15 & ~zmm1 & zmm0) | (zmm15 & zmm1 & zmm0) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 64(%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -8710,6 +8710,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm24, %zmm24 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] ; AVX512-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm24 # 64-byte Folded Reload +; AVX512-NEXT: # zmm24 = (~zmm24 & zmm25 & mem) | (zmm24 & ~zmm25 & ~mem) | (zmm24 & ~zmm25 & mem) | (zmm24 & zmm25 & mem) ; AVX512-NEXT: vinserti64x4 $1, %ymm27, %zmm26, %zmm26 ; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,3,2,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,1,4,5,6,7] @@ -8722,6 +8723,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,1,4,5,6,7] ; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] ; AVX512-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm26 # 64-byte Folded Reload +; AVX512-NEXT: # zmm26 = (~zmm26 & zmm25 & mem) | (zmm26 & ~zmm25 & ~mem) | (zmm26 & ~zmm25 & mem) | (zmm26 & zmm25 & mem) ; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,0,2,1,4,5,6,7] @@ -8736,26 +8738,30 @@ define void 
@store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX512-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm17 # 64-byte Folded Reload +; AVX512-NEXT: # zmm17 = (~zmm17 & zmm25 & mem) | (zmm17 & ~zmm25 & ~mem) | (zmm17 & ~zmm25 & mem) | (zmm17 & zmm25 & mem) ; AVX512-NEXT: vinserti64x4 $1, %ymm16, %zmm18, %zmm16 ; AVX512-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm16 # 64-byte Folded Reload +; AVX512-NEXT: # zmm16 = (~zmm16 & zmm25 & mem) | (zmm16 & ~zmm25 & ~mem) | (zmm16 & ~zmm25 & mem) | (zmm16 & zmm25 & mem) ; AVX512-NEXT: vinserti64x4 $1, %ymm29, %zmm23, %zmm18 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm18 # 64-byte Folded Reload +; AVX512-NEXT: # zmm18 = (~zmm18 & zmm23 & mem) | (zmm18 & ~zmm23 & ~mem) | (zmm18 & ~zmm23 & mem) | (zmm18 & zmm23 & mem) ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm11, %zmm4 ; AVX512-NEXT: vpternlogd $184, (%rsp), %zmm23, %zmm4 # 64-byte Folded Reload +; AVX512-NEXT: # zmm4 = (~zmm4 & zmm23 & mem) | (zmm4 & ~zmm23 & ~mem) | (zmm4 & ~zmm23 & mem) | (zmm4 & zmm23 & mem) ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm7 -; AVX512-NEXT: vpternlogd $184, %zmm20, %zmm23, %zmm7 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm7 = (~zmm7 & zmm23 & zmm20) | (zmm7 & ~zmm23 & ~zmm20) | (zmm7 & ~zmm23 & zmm20) | (zmm7 & zmm23 & zmm20) ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm10, %zmm5 -; AVX512-NEXT: vpternlogd $184, %zmm19, %zmm23, %zmm5 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm5 = (~zmm5 & zmm23 & zmm19) | (zmm5 & ~zmm23 & ~zmm19) | (zmm5 & ~zmm23 & zmm19) | (zmm5 & zmm23 & zmm19) ; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm12, %zmm8 ; AVX512-NEXT: 
vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512-NEXT: vpternlogd $184, %zmm22, %zmm10, %zmm8 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm8 = (~zmm8 & zmm10 & zmm22) | (zmm8 & ~zmm10 & ~zmm22) | (zmm8 & ~zmm10 & zmm22) | (zmm8 & zmm10 & zmm22) ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm14, %zmm6 -; AVX512-NEXT: vpternlogd $184, %zmm28, %zmm10, %zmm6 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm10 & zmm28) | (zmm6 & ~zmm10 & ~zmm28) | (zmm6 & ~zmm10 & zmm28) | (zmm6 & zmm10 & zmm28) ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0 -; AVX512-NEXT: vpternlogd $184, %zmm15, %zmm10, %zmm0 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm10 & zmm15) | (zmm0 & ~zmm10 & ~zmm15) | (zmm0 & ~zmm10 & zmm15) | (zmm0 & zmm10 & zmm15) ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512-NEXT: vpternlogd $184, %zmm1, %zmm10, %zmm2 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm10 & zmm1) | (zmm2 & ~zmm10 & ~zmm1) | (zmm2 & ~zmm10 & zmm1) | (zmm2 & zmm10 & zmm1) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rax) @@ -9166,40 +9172,45 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm28, %zmm6 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm27, %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogd $184, %zmm8, %zmm9, %zmm6 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm9 & zmm8) | (zmm6 & ~zmm9 & ~zmm8) | (zmm6 & ~zmm9 & zmm8) | (zmm6 & zmm9 & zmm8) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 256(%rax) 
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm19, %zmm0 -; AVX512-FCP-NEXT: vpternlogd $184, %zmm0, %zmm9, %zmm12 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm12 = (~zmm12 & zmm9 & zmm0) | (zmm12 & ~zmm9 & ~zmm0) | (zmm12 & ~zmm9 & zmm0) | (zmm12 & zmm9 & zmm0) ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 448(%rax) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm29, %zmm0 -; AVX512-FCP-NEXT: vpternlogd $184, %zmm0, %zmm9, %zmm11 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm11 = (~zmm11 & zmm9 & zmm0) | (zmm11 & ~zmm9 & ~zmm0) | (zmm11 & ~zmm9 & zmm0) | (zmm11 & zmm9 & zmm0) ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 640(%rax) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm0 -; AVX512-FCP-NEXT: vpternlogd $184, %zmm0, %zmm9, %zmm3 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm3 = (~zmm3 & zmm9 & zmm0) | (zmm3 & ~zmm9 & ~zmm0) | (zmm3 & ~zmm9 & zmm0) | (zmm3 & zmm9 & zmm0) ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogd $184, %zmm17, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm0 & zmm17) | (zmm4 & ~zmm0 & ~zmm17) | (zmm4 & ~zmm0 & zmm17) | (zmm4 & zmm0 & zmm17) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512-FCP-NEXT: vpternlogd $184, %zmm16, %zmm0, %zmm5 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm5 = (~zmm5 & zmm0 & zmm16) | (zmm5 & ~zmm0 & ~zmm16) | (zmm5 & ~zmm0 & zmm16) | (zmm5 & zmm0 & zmm16) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 192(%rax) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512-FCP-NEXT: vpternlogd $184, %zmm25, %zmm1, %zmm13 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm13 = (~zmm13 & zmm1 & zmm25) | (zmm13 & ~zmm1 & 
~zmm25) | (zmm13 & ~zmm1 & zmm25) | (zmm13 & zmm1 & zmm25) ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 128(%rax) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm2 = (~zmm2 & zmm1 & mem) | (zmm2 & ~zmm1 & ~mem) | (zmm2 & ~zmm1 & mem) | (zmm2 & zmm1 & mem) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 320(%rax) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm2 = (~zmm2 & zmm0 & mem) | (zmm2 & ~zmm0 & ~mem) | (zmm2 & ~zmm0 & mem) | (zmm2 & zmm0 & mem) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 384(%rax) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm2 = (~zmm2 & zmm0 & mem) | (zmm2 & ~zmm0 & ~mem) | (zmm2 & ~zmm0 & mem) | (zmm2 & zmm0 & mem) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 576(%rax) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm0 = (~zmm0 & zmm1 & mem) | (zmm0 & ~zmm1 & ~mem) | (zmm0 & ~zmm1 & mem) | (zmm0 & zmm1 & mem) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 512(%rax) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm0 = (~zmm0 & zmm1 & mem) | (zmm0 & ~zmm1 & ~mem) | (zmm0 & ~zmm1 & mem) | (zmm0 & zmm1 & mem) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 704(%rax) ; AVX512-FCP-NEXT: addq $1240, %rsp # imm = 0x4D8 ; AVX512-FCP-NEXT: vzeroupper @@ -9545,6 +9556,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve 
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] ; AVX512DQ-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm16 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm16 = (~zmm16 & zmm28 & mem) | (zmm16 & ~zmm28 & ~mem) | (zmm16 & ~zmm28 & mem) | (zmm16 & zmm28 & mem) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,2,2,3] ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm2, %zmm3 {%k1} ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm1 @@ -9621,6 +9633,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] ; AVX512DQ-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm1 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm1 = (~zmm1 & zmm28 & mem) | (zmm1 & ~zmm28 & ~mem) | (zmm1 & ~zmm28 & mem) | (zmm1 & zmm28 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm2, %zmm2 ; AVX512DQ-NEXT: vmovdqa (%r9), %ymm15 @@ -9651,26 +9664,31 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] ; AVX512DQ-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm12 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm12 = (~zmm12 & zmm28 & mem) | (zmm12 & ~zmm28 & ~mem) | (zmm12 & ~zmm28 & mem) | (zmm12 & zmm28 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm30, %zmm29, %zmm17 ; AVX512DQ-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm17 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm17 = (~zmm17 & zmm28 & mem) | (zmm17 & ~zmm28 & ~mem) | (zmm17 & ~zmm28 & 
mem) | (zmm17 & zmm28 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm24, %zmm22 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] ; AVX512DQ-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm22 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm22 = (~zmm22 & zmm24 & mem) | (zmm22 & ~zmm24 & ~mem) | (zmm22 & ~zmm24 & mem) | (zmm22 & zmm24 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm18, %zmm7 ; AVX512DQ-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm7 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm7 = (~zmm7 & zmm24 & mem) | (zmm7 & ~zmm24 & ~mem) | (zmm7 & ~zmm24 & mem) | (zmm7 & zmm24 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 ; AVX512DQ-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm3 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm3 = (~zmm3 & zmm24 & mem) | (zmm3 & ~zmm24 & ~mem) | (zmm3 & ~zmm24 & mem) | (zmm3 & zmm24 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpternlogd $184, %zmm23, %zmm24, %zmm0 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm24 & zmm23) | (zmm0 & ~zmm24 & ~zmm23) | (zmm0 & ~zmm24 & zmm23) | (zmm0 & zmm24 & zmm23) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogd $184, %zmm25, %zmm13, %zmm5 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm5 = (~zmm5 & zmm13 & zmm25) | (zmm5 & ~zmm13 & ~zmm25) | (zmm5 & ~zmm13 & zmm25) | (zmm5 & zmm13 & zmm25) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm6, %zmm6 -; AVX512DQ-NEXT: vpternlogd $184, %zmm26, %zmm13, %zmm6 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm13 & zmm26) | (zmm6 & 
~zmm13 & ~zmm26) | (zmm6 & ~zmm13 & zmm26) | (zmm6 & zmm13 & zmm26) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 -; AVX512DQ-NEXT: vpternlogd $184, %zmm27, %zmm13, %zmm4 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm13 & zmm27) | (zmm4 & ~zmm13 & ~zmm27) | (zmm4 & ~zmm13 & zmm27) | (zmm4 & zmm13 & zmm27) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 -; AVX512DQ-NEXT: vpternlogd $184, %zmm2, %zmm13, %zmm8 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm8 = (~zmm8 & zmm13 & zmm2) | (zmm8 & ~zmm13 & ~zmm2) | (zmm8 & ~zmm13 & zmm2) | (zmm8 & zmm13 & zmm2) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 256(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 448(%rax) @@ -10094,42 +10112,45 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[0,0,2,1,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm16, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm0, %zmm8, %zmm11 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm11 = (~zmm11 & zmm8 & zmm0) | (zmm11 & ~zmm8 & ~zmm0) | (zmm11 & ~zmm8 & zmm0) | (zmm11 & zmm8 & zmm0) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm10, %zmm8, %zmm12 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm12 = (~zmm12 & zmm8 & zmm10) | (zmm12 & ~zmm8 & ~zmm10) | (zmm12 & ~zmm8 & zmm10) | (zmm12 & zmm8 & zmm10) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 192(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm2, %zmm0, 
%zmm7 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = (~zmm7 & zmm0 & zmm2) | (zmm7 & ~zmm0 & ~zmm2) | (zmm7 & ~zmm0 & zmm2) | (zmm7 & zmm0 & zmm2) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 128(%rax) -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm27, %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm0 & zmm27) | (zmm6 & ~zmm0 & ~zmm27) | (zmm6 & ~zmm0 & zmm27) | (zmm6 & zmm0 & zmm27) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 320(%rax) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm24, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm2, %zmm6, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = (~zmm7 & zmm6 & zmm2) | (zmm7 & ~zmm6 & ~zmm2) | (zmm7 & ~zmm6 & zmm2) | (zmm7 & zmm6 & zmm2) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 256(%rax) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm28, %zmm20, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm2, %zmm6, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = (~zmm7 & zmm6 & zmm2) | (zmm7 & ~zmm6 & ~zmm2) | (zmm7 & ~zmm6 & zmm2) | (zmm7 & zmm6 & zmm2) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 448(%rax) -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm23, %zmm8, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm5 = (~zmm5 & zmm8 & zmm23) | (zmm5 & ~zmm8 & ~zmm23) | (zmm5 & ~zmm8 & zmm23) | (zmm5 & zmm8 & zmm23) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 384(%rax) ; AVX512DQ-FCP-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm4 = (~zmm4 & zmm8 & mem) | (zmm4 & ~zmm8 & ~mem) | (zmm4 & ~zmm8 & mem) | (zmm4 & zmm8 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 576(%rax) ; AVX512DQ-FCP-NEXT: 
vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm3 = (~zmm3 & zmm0 & mem) | (zmm3 & ~zmm0 & ~mem) | (zmm3 & ~zmm0 & mem) | (zmm3 & zmm0 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 512(%rax) ; AVX512DQ-FCP-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm1 = (~zmm1 & zmm0 & mem) | (zmm1 & ~zmm0 & ~mem) | (zmm1 & ~zmm0 & mem) | (zmm1 & zmm0 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 704(%rax) ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm25, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm0, %zmm6, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm6 & zmm0) | (zmm1 & ~zmm6 & ~zmm0) | (zmm1 & ~zmm6 & zmm0) | (zmm1 & zmm6 & zmm0) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 640(%rax) ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm0, %zmm6, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm6 & zmm0) | (zmm1 & ~zmm6 & ~zmm0) | (zmm1 & ~zmm6 & zmm0) | (zmm1 & zmm6 & zmm0) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) ; AVX512DQ-FCP-NEXT: addq $1176, %rsp # imm = 0x498 ; AVX512DQ-FCP-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll index dc362d729fcd3..51f173bc1a9bc 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll @@ -647,7 +647,7 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufb {{.*#+}} ymm3 
= ymm3[4,5,12,13,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[6,7],zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm0 & ~mem) | (zmm1 & ~zmm0 & mem) | (zmm1 & zmm0 & ~mem) | (zmm1 & zmm0 & mem) ; AVX512-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) ; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm0 ; AVX512-NEXT: vmovq %xmm0, 48(%rax) @@ -684,7 +684,7 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,2,3],zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm3[26,27],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,20,21,28,29] -; AVX512-FCP-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm1 & mem) | (ymm3 & ~ymm1 & mem) | (ymm3 & ymm1 & mem) ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,7,1,3,7,0,0,0] ; AVX512-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm1[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u] @@ -730,7 +730,7 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,12,13,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[6,7],zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512DQ-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512DQ-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm0 & ~mem) | (zmm1 & ~zmm0 & mem) 
| (zmm1 & zmm0 & ~mem) | (zmm1 & zmm0 & mem) ; AVX512DQ-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) ; AVX512DQ-NEXT: vextracti32x4 $3, %zmm1, %xmm0 ; AVX512DQ-NEXT: vmovq %xmm0, 48(%rax) @@ -767,7 +767,7 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,2,3],zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm3[26,27],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,20,21,28,29] -; AVX512DQ-FCP-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm1 & mem) | (ymm3 & ~ymm1 & mem) | (ymm3 & ymm1 & mem) ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,7,1,3,7,0,0,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm1[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u] @@ -1381,8 +1381,8 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpbroadcastd (%r10), %ymm11 ; AVX512-NEXT: vpbroadcastd 4(%r10), %ymm12 ; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11 -; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm11 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & zmm10 & ~mem) | (~zmm11 & zmm10 & mem) | (zmm11 & ~zmm10 & mem) | (zmm11 & zmm10 & ~mem) | (zmm11 & zmm10 & mem) +; AVX512-NEXT: vpternlogd {{.*#+}} zmm11 = (~zmm11 & zmm9 & mem) | (zmm11 & ~zmm9 & ~mem) | (zmm11 & zmm9 & ~mem) | (zmm11 & zmm9 & mem) ; AVX512-NEXT: vpsrlq $48, %xmm4, %xmm4 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -1401,8 +1401,8 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm8[1,3,3,1] ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,ymm2[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm2[u,u,u,u,u,u,u,u,28,29,20,21] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogd $206, 8(%r10){1to8}, %ymm2, %ymm3 -; AVX512-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 +; AVX512-NEXT: vpternlogd {{.*#+}} ymm3 = (~ymm3 & ~ymm2 & mem) | (~ymm3 & ymm2 & ~mem) | (~ymm3 & ymm2 & mem) | (ymm3 & ymm2 & ~mem) | (ymm3 & ymm2 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm1 & ~mem) | (ymm3 & ~ymm1 & mem) | (ymm3 & ymm1 & ~mem) | (ymm3 & ymm1 & mem) ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqa %xmm0, 96(%rax) ; AVX512-NEXT: vmovdqa %ymm1, 64(%rax) @@ -1441,8 +1441,8 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm8[1,3,3,1] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,ymm2[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm2[u,u,u,u,u,u,u,u,28,29,20,21] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogd $206, 8(%r10){1to8}, %ymm2, %ymm3 -; AVX512-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} ymm3 = (~ymm3 & ~ymm2 & mem) | (~ymm3 & ymm2 & ~mem) | (~ymm3 & ymm2 & mem) | (ymm3 & ymm2 & ~mem) | (ymm3 & ymm2 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm1 & ~mem) | (ymm3 & ~ymm1 & mem) | (ymm3 & ymm1 & ~mem) | (ymm3 & ymm1 & mem) ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1 ; 
AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm7[0,2,0,2] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero @@ -1463,8 +1463,8 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpbroadcastd (%r10), %ymm4 ; AVX512-FCP-NEXT: vpbroadcastd 4(%r10), %ymm5 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 -; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm3 & ~mem) | (~zmm4 & zmm3 & mem) | (zmm4 & ~zmm3 & mem) | (zmm4 & zmm3 & ~mem) | (zmm4 & zmm3 & mem) +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm2 & mem) | (zmm4 & ~zmm2 & ~mem) | (zmm4 & zmm2 & ~mem) | (zmm4 & zmm2 & mem) ; AVX512-FCP-NEXT: vmovdqa %xmm0, 96(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512-FCP-NEXT: vmovdqa %ymm1, 64(%rax) @@ -1506,8 +1506,8 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpbroadcastd (%r10), %ymm11 ; AVX512DQ-NEXT: vpbroadcastd 4(%r10), %ymm12 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512DQ-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11 -; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & zmm10 & ~mem) | (~zmm11 & zmm10 & mem) | (zmm11 & ~zmm10 & mem) | (zmm11 & zmm10 & ~mem) | (zmm11 & zmm10 & mem) +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm11 = (~zmm11 & zmm9 & mem) | (zmm11 & ~zmm9 & ~mem) | (zmm11 & zmm9 & ~mem) | (zmm11 & zmm9 & mem) ; AVX512DQ-NEXT: vpsrlq $48, %xmm4, %xmm4 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -1526,8 
+1526,8 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm8[1,3,3,1] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,ymm2[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm2[u,u,u,u,u,u,u,u,28,29,20,21] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogd $206, 8(%r10){1to8}, %ymm2, %ymm3 -; AVX512DQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm3 = (~ymm3 & ~ymm2 & mem) | (~ymm3 & ymm2 & ~mem) | (~ymm3 & ymm2 & mem) | (ymm3 & ymm2 & ~mem) | (ymm3 & ymm2 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm1 & ~mem) | (ymm3 & ~ymm1 & mem) | (ymm3 & ymm1 & ~mem) | (ymm3 & ymm1 & mem) ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqa %xmm0, 96(%rax) ; AVX512DQ-NEXT: vmovdqa %ymm1, 64(%rax) @@ -1566,8 +1566,8 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm8[1,3,3,1] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,ymm2[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm2[u,u,u,u,u,u,u,u,28,29,20,21] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogd $206, 8(%r10){1to8}, %ymm2, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} ymm3 = (~ymm3 & ~ymm2 & mem) | (~ymm3 & ymm2 & ~mem) | (~ymm3 & ymm2 & mem) | (ymm3 & ymm2 & ~mem) | (ymm3 & ymm2 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm1 & ~mem) | (ymm3 & ~ymm1 & mem) | (ymm3 & ymm1 & ~mem) | (ymm3 & ymm1 & mem) ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = 
ymm7[0,2,0,2] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero @@ -1588,8 +1588,8 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpbroadcastd (%r10), %ymm4 ; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%r10), %ymm5 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm3 & ~mem) | (~zmm4 & zmm3 & mem) | (zmm4 & ~zmm3 & mem) | (zmm4 & zmm3 & ~mem) | (zmm4 & zmm3 & mem) +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm2 & mem) | (zmm4 & ~zmm2 & ~mem) | (zmm4 & zmm2 & ~mem) | (zmm4 & zmm2 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 96(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, 64(%rax) @@ -2937,27 +2937,27 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vinserti64x4 $1, %ymm16, %zmm7, %zmm7 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm9, %zmm9 -; AVX512-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm9 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm9 = (~zmm9 & zmm7 & ~mem) | (zmm9 & ~zmm7 & mem) | (zmm9 & zmm7 & ~mem) | (zmm9 & zmm7 & mem) ; AVX512-NEXT: vinserti64x4 $1, %ymm19, %zmm4, %zmm4 -; AVX512-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm5 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm5 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm4 & mem) | (zmm5 & ~zmm4 & ~mem) | (zmm5 & ~zmm4 & mem) | (zmm5 & zmm4 & ~mem) | (zmm5 & zmm4 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm9 & mem) | (zmm5 & ~zmm9 & ~mem) | (zmm5 & zmm9 & ~mem) | (zmm5 & zmm9 & mem) 
; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm4 ; AVX512-NEXT: vpermq {{.*#+}} zmm7 = zmm20[2,2,2,3,6,6,6,7] -; AVX512-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm7 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm7 = (~zmm7 & zmm4 & ~mem) | (zmm7 & ~zmm4 & mem) | (zmm7 & zmm4 & ~mem) | (zmm7 & zmm4 & mem) ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm0 -; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm3 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm3 = (~zmm3 & zmm0 & mem) | (zmm3 & ~zmm0 & ~mem) | (zmm3 & zmm0 & ~mem) | (zmm3 & zmm0 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & zmm7 & mem) | (zmm3 & ~zmm7 & ~mem) | (zmm3 & zmm7 & ~mem) | (zmm3 & zmm7 & mem) ; AVX512-NEXT: vpermq {{.*#+}} zmm0 = zmm1[0,0,1,1,4,4,5,5] -; AVX512-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm0 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm18 & ~mem) | (zmm0 & ~zmm18 & mem) | (zmm0 & zmm18 & ~mem) | (zmm0 & zmm18 & mem) ; AVX512-NEXT: vpbroadcastd (%rax), %ymm1 ; AVX512-NEXT: vpbroadcastd 4(%rax), %ymm4 ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm1 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm6 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm15 & mem) | (zmm1 & ~zmm15 & ~mem) | (zmm1 & zmm15 & ~mem) | (zmm1 & zmm15 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm0 & mem) | (zmm1 & ~zmm0 & ~mem) | (zmm1 & zmm0 & ~mem) | (zmm1 & zmm0 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ymm8 & mem) | (ymm6 & ~ymm8 & ~mem) | (ymm6 & ymm8 & ~mem) | (ymm6 & ymm8 & mem) ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm13[2,1,3,2] -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 -; AVX512-NEXT: 
vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ymm2 & mem) | (ymm0 & ~ymm2 & ~mem) | (ymm0 & ymm2 & ~mem) | (ymm0 & ymm2 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ymm6 & mem) | (ymm0 & ~ymm6 & ~mem) | (ymm0 & ymm6 & ~mem) | (ymm0 & ymm6 & mem) ; AVX512-NEXT: vmovdqa %ymm0, 192(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm1, (%rcx) ; AVX512-NEXT: vmovdqa64 %zmm3, 128(%rcx) @@ -3065,24 +3065,24 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm3, %zmm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm2 & mem) | (zmm0 & ~zmm2 & ~mem) | (zmm0 & zmm2 & ~mem) | (zmm0 & zmm2 & mem) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm8, %zmm2 -; AVX512-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm10 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm2 & mem) | (zmm10 & ~zmm2 & ~mem) | (zmm10 & ~zmm2 & mem) | (zmm10 & zmm2 & ~mem) | (zmm10 & zmm2 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm0 & mem) | (zmm10 & ~zmm0 & ~mem) | (zmm10 & zmm0 & ~mem) | (zmm10 & zmm0 & mem) ; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm19[2,2,2,3,6,6,6,7] -; AVX512-FCP-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm0 -; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm6 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm6 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm9 & ~mem) | (zmm0 & ~zmm9 & mem) | (zmm0 & zmm9 & ~mem) | (zmm0 & zmm9 & mem) +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm13 & mem) | (zmm6 & ~zmm13 
& ~mem) | (zmm6 & zmm13 & ~mem) | (zmm6 & zmm13 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = (~zmm6 & zmm0 & mem) | (zmm6 & ~zmm0 & ~mem) | (zmm6 & zmm0 & ~mem) | (zmm6 & zmm0 & mem) ; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm11[0,0,1,1,4,4,5,5] -; AVX512-FCP-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm0 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm15 & ~mem) | (zmm0 & ~zmm15 & mem) | (zmm0 & zmm15 & ~mem) | (zmm0 & zmm15 & mem) ; AVX512-FCP-NEXT: vpbroadcastd (%rax), %ymm2 ; AVX512-FCP-NEXT: vpbroadcastd 4(%rax), %ymm3 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm2 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm4 -; AVX512-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm1 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm1 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm20 & mem) | (zmm2 & ~zmm20 & ~mem) | (zmm2 & zmm20 & ~mem) | (zmm2 & zmm20 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm0 & mem) | (zmm2 & ~zmm0 & ~mem) | (zmm2 & zmm0 & ~mem) | (zmm2 & zmm0 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ymm5 & mem) | (ymm4 & ~ymm5 & ~mem) | (ymm4 & ymm5 & ~mem) | (ymm4 & ymm5 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm7 & ~mem) | (ymm1 & ~ymm7 & mem) | (ymm1 & ymm7 & ~mem) | (ymm1 & ymm7 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm4 & mem) | (ymm1 & ~ymm4 & ~mem) | (ymm1 & ymm4 & ~mem) | (ymm1 & ymm4 & mem) ; AVX512-FCP-NEXT: vmovdqa %ymm1, 192(%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 128(%rcx) @@ -3199,27 +3199,27 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vinserti64x4 
$1, %ymm16, %zmm7, %zmm7 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm9, %zmm9 -; AVX512DQ-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm9 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm9 = (~zmm9 & zmm7 & ~mem) | (zmm9 & ~zmm7 & mem) | (zmm9 & zmm7 & ~mem) | (zmm9 & zmm7 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm19, %zmm4, %zmm4 -; AVX512DQ-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm5 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm4 & mem) | (zmm5 & ~zmm4 & ~mem) | (zmm5 & ~zmm4 & mem) | (zmm5 & zmm4 & ~mem) | (zmm5 & zmm4 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm9 & mem) | (zmm5 & ~zmm9 & ~mem) | (zmm5 & zmm9 & ~mem) | (zmm5 & zmm9 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm4 ; AVX512DQ-NEXT: vpermq {{.*#+}} zmm7 = zmm20[2,2,2,3,6,6,6,7] -; AVX512DQ-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm7 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm7 = (~zmm7 & zmm4 & ~mem) | (zmm7 & ~zmm4 & mem) | (zmm7 & zmm4 & ~mem) | (zmm7 & zmm4 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm0 -; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm3 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm3 = (~zmm3 & zmm0 & mem) | (zmm3 & ~zmm0 & ~mem) | (zmm3 & zmm0 & ~mem) | (zmm3 & zmm0 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & zmm7 & mem) | (zmm3 & ~zmm7 & ~mem) | (zmm3 & zmm7 & ~mem) | (zmm3 & zmm7 & mem) ; AVX512DQ-NEXT: vpermq {{.*#+}} zmm0 = zmm1[0,0,1,1,4,4,5,5] -; AVX512DQ-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm0 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm18 & ~mem) | (zmm0 & ~zmm18 & mem) | (zmm0 & zmm18 & ~mem) | (zmm0 & zmm18 & mem) ; AVX512DQ-NEXT: vpbroadcastd (%rax), %ymm1 ; 
AVX512DQ-NEXT: vpbroadcastd 4(%rax), %ymm4 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm1 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm6 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm15 & mem) | (zmm1 & ~zmm15 & ~mem) | (zmm1 & zmm15 & ~mem) | (zmm1 & zmm15 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm0 & mem) | (zmm1 & ~zmm0 & ~mem) | (zmm1 & zmm0 & ~mem) | (zmm1 & zmm0 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ymm8 & mem) | (ymm6 & ~ymm8 & ~mem) | (ymm6 & ymm8 & ~mem) | (ymm6 & ymm8 & mem) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm13[2,1,3,2] -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ymm2 & mem) | (ymm0 & ~ymm2 & ~mem) | (ymm0 & ymm2 & ~mem) | (ymm0 & ymm2 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ymm6 & mem) | (ymm0 & ~ymm6 & ~mem) | (ymm0 & ymm6 & ~mem) | (ymm0 & ymm6 & mem) ; AVX512DQ-NEXT: vmovdqa %ymm0, 192(%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rcx) @@ -3327,24 +3327,24 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm3, %zmm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm2 & mem) | (zmm0 & ~zmm2 & ~mem) | (zmm0 & zmm2 & ~mem) | (zmm0 & zmm2 & mem) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm8, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm10 
-; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm2 & mem) | (zmm10 & ~zmm2 & ~mem) | (zmm10 & ~zmm2 & mem) | (zmm10 & zmm2 & ~mem) | (zmm10 & zmm2 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm0 & mem) | (zmm10 & ~zmm0 & ~mem) | (zmm10 & zmm0 & ~mem) | (zmm10 & zmm0 & mem) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm19[2,2,2,3,6,6,6,7] -; AVX512DQ-FCP-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm6 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm9 & ~mem) | (zmm0 & ~zmm9 & mem) | (zmm0 & zmm9 & ~mem) | (zmm0 & zmm9 & mem) +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm13 & mem) | (zmm6 & ~zmm13 & ~mem) | (zmm6 & zmm13 & ~mem) | (zmm6 & zmm13 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = (~zmm6 & zmm0 & mem) | (zmm6 & ~zmm0 & ~mem) | (zmm6 & zmm0 & ~mem) | (zmm6 & zmm0 & mem) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm11[0,0,1,1,4,4,5,5] -; AVX512DQ-FCP-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm15 & ~mem) | (zmm0 & ~zmm15 & mem) | (zmm0 & zmm15 & ~mem) | (zmm0 & zmm15 & mem) ; AVX512DQ-FCP-NEXT: vpbroadcastd (%rax), %ymm2 ; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%rax), %ymm3 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm4 -; AVX512DQ-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%ymm4, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm20 & mem) | (zmm2 & ~zmm20 & ~mem) | (zmm2 & zmm20 & ~mem) | (zmm2 & zmm20 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm0 & mem) | (zmm2 & ~zmm0 & ~mem) | (zmm2 & zmm0 & ~mem) | (zmm2 & zmm0 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ymm5 & mem) | (ymm4 & ~ymm5 & ~mem) | (ymm4 & ymm5 & ~mem) | (ymm4 & ymm5 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm7 & ~mem) | (ymm1 & ~ymm7 & mem) | (ymm1 & ymm7 & ~mem) | (ymm1 & ymm7 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm4 & mem) | (ymm1 & ~ymm4 & ~mem) | (ymm1 & ymm4 & ~mem) | (ymm1 & ymm4 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, 192(%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 128(%rcx) @@ -6136,19 +6136,19 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm28 ; AVX512-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm29 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $184, %zmm28, %zmm30, %zmm29 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm29 = (~zmm29 & zmm30 & zmm28) | (zmm29 & ~zmm30 & ~zmm28) | (zmm29 & ~zmm30 & zmm28) | (zmm29 & zmm30 & zmm28) ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] ; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm1, %zmm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512-NEXT: vpternlogq $226, %zmm1, %zmm30, %zmm9 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = (~zmm9 & ~zmm30 & zmm1) | (zmm9 & ~zmm30 & zmm1) | (zmm9 & zmm30 & ~zmm1) | (zmm9 & zmm30 & zmm1) ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 32-byte Folded Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm1 # 
32-byte Folded Reload ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm2 & zmm0) | (zmm1 & ~zmm2 & zmm0) | (zmm1 & zmm2 & ~zmm0) | (zmm1 & zmm2 & zmm0) ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 32-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm3 & ~mem) | (~zmm0 & zmm3 & mem) | (zmm0 & ~zmm3 & mem) | (zmm0 & zmm3 & ~mem) | (zmm0 & zmm3 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm1 & mem) | (zmm0 & ~zmm1 & ~mem) | (zmm0 & zmm1 & ~mem) | (zmm0 & zmm1 & mem) ; AVX512-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX512-NEXT: # ymm1 = mem[2,3,3,3,6,7,7,7] ; AVX512-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload @@ -6177,48 +6177,48 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2] ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm29, %zmm5 -; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm27 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm29 & mem) | (zmm5 & ~zmm29 & ~mem) | (zmm5 & zmm29 & ~mem) | (zmm5 & zmm29 & mem) +; AVX512-NEXT: vpternlogd {{.*#+}} zmm27 = (~zmm27 & zmm5 & mem) | (zmm27 & ~zmm5 & ~mem) | (zmm27 & zmm5 & ~mem) | (zmm27 & zmm5 & mem) ; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm15[0,0,0,1] ; 
AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX512-NEXT: vpbroadcastd 32(%rax), %ymm5 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm1 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm4 & mem) | (zmm1 & ~zmm4 & ~mem) | (zmm1 & zmm4 & ~mem) | (zmm1 & zmm4 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm9 & mem) | (zmm1 & ~zmm9 & ~mem) | (zmm1 & zmm9 & ~mem) | (zmm1 & zmm9 & mem) ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm3 ; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm4 -; AVX512-NEXT: vpternlogq $226, %zmm3, %zmm2, %zmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm2 & zmm3) | (zmm4 & ~zmm2 & zmm3) | (zmm4 & zmm2 & ~zmm3) | (zmm4 & zmm2 & zmm3) ; AVX512-NEXT: vpbroadcastd 36(%rax), %ymm2 ; AVX512-NEXT: vpbroadcastd 40(%rax), %ymm3 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm3 & mem) | (zmm2 & ~zmm3 & ~mem) | (zmm2 & zmm3 & ~mem) | (zmm2 & zmm3 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm4 & mem) | (zmm2 & ~zmm4 & ~mem) | (zmm2 & zmm4 & ~mem) | (zmm2 & zmm4 & mem) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm3, %zmm3 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm4, %zmm4 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm3 & mem) | (zmm4 & ~zmm3 & ~mem) | (zmm4 & zmm3 & ~mem) | 
(zmm4 & zmm3 & mem) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm19[0,1,2,3] -; AVX512-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm20 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm20 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = (~zmm20 & zmm3 & mem) | (zmm20 & ~zmm3 & ~mem) | (zmm20 & ~zmm3 & mem) | (zmm20 & zmm3 & ~mem) | (zmm20 & zmm3 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = (~zmm20 & zmm4 & mem) | (zmm20 & ~zmm4 & ~mem) | (zmm20 & zmm4 & ~mem) | (zmm20 & zmm4 & mem) ; AVX512-NEXT: vinserti64x4 $1, %ymm28, %zmm21, %zmm3 -; AVX512-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm26, %zmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & zmm26 & ~mem) | (zmm3 & ~zmm26 & mem) | (zmm3 & zmm26 & ~mem) | (zmm3 & zmm26 & mem) ; AVX512-NEXT: vpbroadcastd (%rax), %ymm4 ; AVX512-NEXT: vpbroadcastd 4(%rax), %ymm5 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm4 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm25 & mem) | (zmm4 & ~zmm25 & ~mem) | (zmm4 & zmm25 & ~mem) | (zmm4 & zmm25 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm3 & mem) | (zmm4 & ~zmm3 & ~mem) | (zmm4 & zmm3 & ~mem) | (zmm4 & zmm3 & mem) ; AVX512-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm3 ; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm5 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm3 & mem) | (zmm5 & ~zmm3 & ~mem) | (zmm5 & zmm3 & ~mem) | (zmm5 & zmm3 & mem) ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm3 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] ; AVX512-NEXT: vpermd (%rax), %zmm6, %zmm6 -; AVX512-NEXT: vpternlogd $216, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm3 & mem) | (zmm6 & ~zmm3 & ~mem) | (zmm6 & zmm3 & ~mem) | (zmm6 & zmm3 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = (~zmm6 & zmm5 & mem) | (zmm6 & ~zmm5 & ~mem) | (zmm6 & zmm5 & ~mem) | (zmm6 & zmm5 & mem) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512-NEXT: vmovdqa64 %zmm4, (%rax) @@ -6447,20 +6447,20 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm24, %zmm16, %zmm12 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm15 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = (~zmm12 & ~zmm16 & zmm24) | (zmm12 & ~zmm16 & zmm24) | (zmm12 & zmm16 & ~zmm24) | (zmm12 & zmm16 & zmm24) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = (~zmm15 & zmm12 & mem) | (zmm15 & ~zmm12 & ~mem) | (zmm15 & zmm12 & ~mem) | (zmm15 & zmm12 & mem) ; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm5 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,5,2,2,6,6,6,6,30,31,27,27,31,31,30,31] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm2, %zmm7 -; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm7 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = (~zmm7 & zmm15 & mem) | (zmm7 & ~zmm15 & ~mem) | (zmm7 & zmm15 & ~mem) | (zmm7 & zmm15 & mem) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
[65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm26, %zmm2, %zmm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm2 & zmm26) | (zmm8 & ~zmm2 & zmm26) | (zmm8 & zmm2 & ~zmm26) | (zmm8 & zmm2 & zmm26) ; AVX512-FCP-NEXT: vpbroadcastd 36(%rax), %ymm12 ; AVX512-FCP-NEXT: vpbroadcastd 40(%rax), %ymm13 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm29, %zmm12 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm12 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm27 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm12 = (~zmm12 & zmm29 & mem) | (zmm12 & ~zmm29 & ~mem) | (zmm12 & zmm29 & ~mem) | (zmm12 & zmm29 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = (~zmm12 & zmm8 & mem) | (zmm12 & ~zmm8 & ~mem) | (zmm12 & zmm8 & ~mem) | (zmm12 & zmm8 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = (~zmm27 & zmm3 & mem) | (zmm27 & ~zmm3 & ~mem) | (zmm27 & zmm3 & ~mem) | (zmm27 & zmm3 & mem) ; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Reload @@ -6477,36 +6477,36 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm24[1,1,1,1,5,5,5,5] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1],ymm4[2],ymm15[3,4],ymm4[5],ymm15[6,7,8,9],ymm4[10],ymm15[11,12],ymm4[13],ymm15[14,15] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm4 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm6 & mem) | (zmm4 & ~zmm6 & ~mem) | 
(zmm4 & zmm6 & ~mem) | (zmm4 & zmm6 & mem) ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [6,0,0,0,7,0,0,7] ; AVX512-FCP-NEXT: vpermd %ymm10, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5 ; AVX512-FCP-NEXT: vpermd %zmm5, %zmm22, %zmm5 -; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm31, %zmm5 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm5 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm5 = (~zmm5 & zmm31 & mem) | (zmm5 & ~zmm31 & ~mem) | (zmm5 & zmm31 & ~mem) | (zmm5 & zmm31 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm4 & mem) | (zmm5 & ~zmm4 & ~mem) | (zmm5 & zmm4 & ~mem) | (zmm5 & zmm4 & mem) ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm6 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq $226, %zmm4, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vpternlogq $226, %zmm25, %zmm16, %zmm21 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = (~zmm6 & ~zmm2 & zmm4) | (zmm6 & ~zmm2 & zmm4) | (zmm6 & zmm2 & ~zmm4) | (zmm6 & zmm2 & zmm4) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = (~zmm21 & ~zmm16 & zmm25) | (zmm21 & ~zmm16 & zmm25) | (zmm21 & zmm16 & ~zmm25) | (zmm21 & zmm16 & zmm25) ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm30, %zmm0 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm30 & ~mem) | (~zmm0 & zmm30 & mem) | (zmm0 & ~zmm30 & mem) | (zmm0 & zmm30 & ~mem) | (zmm0 & zmm30 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm6 & mem) | (zmm0 & ~zmm6 & ~mem) | (zmm0 & zmm6 & ~mem) | (zmm0 & zmm6 & mem) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm14, 
%zmm2, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm1 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm2 & mem) | (zmm4 & ~zmm2 & ~mem) | (zmm4 & zmm2 & ~mem) | (zmm4 & zmm2 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm9 & mem) | (zmm1 & ~zmm9 & ~mem) | (zmm1 & ~zmm9 & mem) | (zmm1 & zmm9 & ~mem) | (zmm1 & zmm9 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm4 & mem) | (zmm1 & ~zmm4 & ~mem) | (zmm1 & zmm4 & ~mem) | (zmm1 & zmm4 & mem) ; AVX512-FCP-NEXT: vpbroadcastd 32(%rax), %ymm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm23, %zmm2 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm2 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm23 & mem) | (zmm2 & ~zmm23 & ~mem) | (zmm2 & zmm23 & ~mem) | (zmm2 & zmm23 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm21 & mem) | (zmm2 & ~zmm21 & ~mem) | (zmm2 & zmm21 & ~mem) | (zmm2 & zmm21 & mem) ; AVX512-FCP-NEXT: vpbroadcastd (%rax), %ymm3 ; AVX512-FCP-NEXT: vpbroadcastd 4(%rax), %ymm4 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm3 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm3 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm3 = (~zmm3 & zmm28 & mem) | (zmm3 & ~zmm28 & ~mem) | (zmm3 & zmm28 & ~mem) | (zmm3 & zmm28 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & zmm27 & mem) | (zmm3 & ~zmm27 & ~mem) | (zmm3 & zmm27 & ~mem) | (zmm3 & zmm27 & mem) ; AVX512-FCP-NEXT: 
movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 128(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rax) @@ -6764,19 +6764,19 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm28 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm29 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $184, %zmm28, %zmm30, %zmm29 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm29 = (~zmm29 & zmm30 & zmm28) | (zmm29 & ~zmm30 & ~zmm28) | (zmm29 & ~zmm30 & zmm28) | (zmm29 & zmm30 & zmm28) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm1, %zmm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512DQ-NEXT: vpternlogq $226, %zmm1, %zmm30, %zmm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = (~zmm9 & ~zmm30 & zmm1) | (zmm9 & ~zmm30 & zmm1) | (zmm9 & zmm30 & ~zmm1) | (zmm9 & zmm30 & zmm1) ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm1 # 32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm2 & zmm0) | (zmm1 & ~zmm2 & zmm0) | (zmm1 & zmm2 & ~zmm0) | (zmm1 & zmm2 & zmm0) ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 -; AVX512DQ-NEXT: vpternlogq $216, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm3 & ~mem) | (~zmm0 & zmm3 & mem) | (zmm0 & ~zmm3 & mem) | (zmm0 & zmm3 & ~mem) | (zmm0 & zmm3 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm1 & mem) | (zmm0 & ~zmm1 & ~mem) | (zmm0 & zmm1 & ~mem) | (zmm0 & zmm1 & mem) ; AVX512DQ-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm1 = mem[2,3,3,3,6,7,7,7] ; AVX512DQ-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload @@ -6805,48 +6805,48 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm29, %zmm5 -; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm27 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm29 & mem) | (zmm5 & ~zmm29 & ~mem) | (zmm5 & zmm29 & ~mem) | (zmm5 & zmm29 & mem) +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm27 = (~zmm27 & zmm5 & mem) | (zmm27 & ~zmm5 & ~mem) | (zmm27 & zmm5 & ~mem) | (zmm27 & zmm5 & mem) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm15[0,0,0,1] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX512DQ-NEXT: vpbroadcastd 32(%rax), %ymm5 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm1 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm4 & mem) | (zmm1 & ~zmm4 & ~mem) | (zmm1 & zmm4 & ~mem) | (zmm1 & zmm4 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm9 & mem) | (zmm1 & ~zmm9 & ~mem) | (zmm1 & zmm9 & ~mem) | (zmm1 & zmm9 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, 
%zmm3, %zmm3 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm4 -; AVX512DQ-NEXT: vpternlogq $226, %zmm3, %zmm2, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm2 & zmm3) | (zmm4 & ~zmm2 & zmm3) | (zmm4 & zmm2 & ~zmm3) | (zmm4 & zmm2 & zmm3) ; AVX512DQ-NEXT: vpbroadcastd 36(%rax), %ymm2 ; AVX512DQ-NEXT: vpbroadcastd 40(%rax), %ymm3 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm3 & mem) | (zmm2 & ~zmm3 & ~mem) | (zmm2 & zmm3 & ~mem) | (zmm2 & zmm3 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm4 & mem) | (zmm2 & ~zmm4 & ~mem) | (zmm2 & zmm4 & ~mem) | (zmm2 & zmm4 & mem) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm3, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm18, %zmm4, %zmm4 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm3 & mem) | (zmm4 & ~zmm3 & ~mem) | (zmm4 & zmm3 & ~mem) | (zmm4 & zmm3 & mem) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm19[0,1,2,3] -; AVX512DQ-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm20 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm20 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = (~zmm20 & zmm3 & mem) | (zmm20 & ~zmm3 & ~mem) | (zmm20 & ~zmm3 & mem) | (zmm20 & zmm3 & ~mem) | (zmm20 & zmm3 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = (~zmm20 & zmm4 & mem) | (zmm20 & ~zmm4 & ~mem) | (zmm20 & zmm4 & ~mem) | (zmm20 & zmm4 
& mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm28, %zmm21, %zmm3 -; AVX512DQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm26, %zmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & zmm26 & ~mem) | (zmm3 & ~zmm26 & mem) | (zmm3 & zmm26 & ~mem) | (zmm3 & zmm26 & mem) ; AVX512DQ-NEXT: vpbroadcastd (%rax), %ymm4 ; AVX512DQ-NEXT: vpbroadcastd 4(%rax), %ymm5 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm4 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm25 & mem) | (zmm4 & ~zmm25 & ~mem) | (zmm4 & zmm25 & ~mem) | (zmm4 & zmm25 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm3 & mem) | (zmm4 & ~zmm3 & ~mem) | (zmm4 & zmm3 & ~mem) | (zmm4 & zmm3 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm3 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm5 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm3 & mem) | (zmm5 & ~zmm3 & ~mem) | (zmm5 & zmm3 & ~mem) | (zmm5 & zmm3 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm3 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] ; AVX512DQ-NEXT: vpermd (%rax), %zmm6, %zmm6 -; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm3 & mem) | (zmm6 & ~zmm3 & ~mem) | (zmm6 & zmm3 & ~mem) | (zmm6 & zmm3 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = (~zmm6 & zmm5 & mem) | (zmm6 & ~zmm5 & ~mem) | (zmm6 & zmm5 & ~mem) | (zmm6 & zmm5 & mem) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rax) @@ -7075,20 +7075,20 @@ define void @store_i16_stride7_vf32(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm24, %zmm16, %zmm12 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm15 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = (~zmm12 & ~zmm16 & zmm24) | (zmm12 & ~zmm16 & zmm24) | (zmm12 & zmm16 & ~zmm24) | (zmm12 & zmm16 & zmm24) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = (~zmm15 & zmm12 & mem) | (zmm15 & ~zmm12 & ~mem) | (zmm15 & zmm12 & ~mem) | (zmm15 & zmm12 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm5 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,5,2,2,6,6,6,6,30,31,27,27,31,31,30,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = (~zmm7 & zmm15 & mem) | (zmm7 & ~zmm15 & ~mem) | (zmm7 & zmm15 & ~mem) | (zmm7 & zmm15 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm26, %zmm2, %zmm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm2 & zmm26) | (zmm8 & ~zmm2 & zmm26) | (zmm8 & zmm2 & ~zmm26) | (zmm8 & zmm2 & zmm26) ; AVX512DQ-FCP-NEXT: vpbroadcastd 36(%rax), %ymm12 ; AVX512DQ-FCP-NEXT: vpbroadcastd 40(%rax), %ymm13 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm29, %zmm12 -; AVX512DQ-FCP-NEXT: 
vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm12 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm27 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm12 = (~zmm12 & zmm29 & mem) | (zmm12 & ~zmm29 & ~mem) | (zmm12 & zmm29 & ~mem) | (zmm12 & zmm29 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = (~zmm12 & zmm8 & mem) | (zmm12 & ~zmm8 & ~mem) | (zmm12 & zmm8 & ~mem) | (zmm12 & zmm8 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = (~zmm27 & zmm3 & mem) | (zmm27 & ~zmm3 & ~mem) | (zmm27 & zmm3 & ~mem) | (zmm27 & zmm3 & mem) ; AVX512DQ-FCP-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Reload @@ -7105,36 +7105,36 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm24[1,1,1,1,5,5,5,5] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1],ymm4[2],ymm15[3,4],ymm4[5],ymm15[6,7,8,9],ymm4[10],ymm15[11,12],ymm4[13],ymm15[14,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm6 & mem) | (zmm4 & ~zmm6 & ~mem) | (zmm4 & zmm6 & ~mem) | (zmm4 & zmm6 & mem) ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [6,0,0,0,7,0,0,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5 ; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm22, %zmm5 -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm31, %zmm5 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm5 = (~zmm5 & zmm31 & mem) | (zmm5 & ~zmm31 & ~mem) | (zmm5 & zmm31 & ~mem) | (zmm5 & zmm31 & mem) +; AVX512DQ-FCP-NEXT: 
vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm4 & mem) | (zmm5 & ~zmm4 & ~mem) | (zmm5 & zmm4 & ~mem) | (zmm5 & zmm4 & mem) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm6 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm4, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm25, %zmm16, %zmm21 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = (~zmm6 & ~zmm2 & zmm4) | (zmm6 & ~zmm2 & zmm4) | (zmm6 & zmm2 & ~zmm4) | (zmm6 & zmm2 & zmm4) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = (~zmm21 & ~zmm16 & zmm25) | (zmm21 & ~zmm16 & zmm25) | (zmm21 & zmm16 & ~zmm25) | (zmm21 & zmm16 & zmm25) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm30, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm30 & ~mem) | (~zmm0 & zmm30 & mem) | (zmm0 & ~zmm30 & mem) | (zmm0 & zmm30 & ~mem) | (zmm0 & zmm30 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm6 & mem) | (zmm0 & ~zmm6 & ~mem) | (zmm0 & zmm6 & ~mem) | (zmm0 & zmm6 & mem) ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm2, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm2 & mem) | (zmm4 & ~zmm2 & ~mem) | (zmm4 & zmm2 & ~mem) | (zmm4 & zmm2 & mem) +; 
AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm9 & mem) | (zmm1 & ~zmm9 & ~mem) | (zmm1 & ~zmm9 & mem) | (zmm1 & zmm9 & ~mem) | (zmm1 & zmm9 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm4 & mem) | (zmm1 & ~zmm4 & ~mem) | (zmm1 & zmm4 & ~mem) | (zmm1 & zmm4 & mem) ; AVX512DQ-FCP-NEXT: vpbroadcastd 32(%rax), %ymm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm23, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm23 & mem) | (zmm2 & ~zmm23 & ~mem) | (zmm2 & zmm23 & ~mem) | (zmm2 & zmm23 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm21 & mem) | (zmm2 & ~zmm21 & ~mem) | (zmm2 & zmm21 & ~mem) | (zmm2 & zmm21 & mem) ; AVX512DQ-FCP-NEXT: vpbroadcastd (%rax), %ymm3 ; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%rax), %ymm4 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm3 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm3 = (~zmm3 & zmm28 & mem) | (zmm3 & ~zmm28 & ~mem) | (zmm3 & zmm28 & ~mem) | (zmm3 & zmm28 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & zmm27 & mem) | (zmm3 & ~zmm27 & ~mem) | (zmm3 & zmm27 & ~mem) | (zmm3 & zmm27 & mem) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 128(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax) @@ -12769,24 +12769,24 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufb %ymm12, %ymm9, %ymm9 ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm17, %zmm6 ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm18, %zmm7 -; AVX512-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm7 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 
& zmm6 & ~mem) | (zmm7 & ~zmm6 & mem) | (zmm7 & zmm6 & ~mem) | (zmm7 & zmm6 & mem) ; AVX512-NEXT: vmovdqa 96(%r8), %ymm6 ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[1,2,2,3,5,6,6,7] ; AVX512-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[16,17,u,u,u,u],zero,zero ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $248, %ymm11, %ymm7, %ymm6 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ymm7 & ymm11) | (ymm6 & ~ymm7 & ~ymm11) | (ymm6 & ~ymm7 & ymm11) | (ymm6 & ymm7 & ~ymm11) | (ymm6 & ymm7 & ymm11) ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm12 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $248, %ymm11, %ymm6, %ymm9 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ymm6 & ymm11) | (ymm9 & ~ymm6 & ~ymm11) | (ymm9 & ~ymm6 & ymm11) | (ymm9 & ymm6 & ~ymm11) | (ymm9 & ymm6 & ymm11) ; AVX512-NEXT: vextracti64x4 $1, %zmm7, %ymm6 ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm10[2,1,3,3] ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512-NEXT: vpternlogq $184, %ymm6, %ymm10, %ymm7 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = (~ymm7 & ymm10 & ymm6) | (ymm7 & ~ymm10 & ~ymm6) | (ymm7 & ~ymm10 & ymm6) | (ymm7 & ymm10 & ymm6) ; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm8[2,2,2,2] ; AVX512-NEXT: 
vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $184, %ymm7, %ymm8, %ymm6 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ymm8 & ymm7) | (ymm6 & ~ymm8 & ~ymm7) | (ymm6 & ~ymm8 & ymm7) | (ymm6 & ymm8 & ymm7) ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,2,3],zmm6[4,5,6,7] ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -13024,7 +13024,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpbroadcastd 100(%rax), %ymm1 ; AVX512-NEXT: vpbroadcastd 104(%rax), %ymm2 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm31 -; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm31 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm31 = (~zmm31 & zmm0 & mem) | (zmm31 & ~zmm0 & ~mem) | (zmm31 & zmm0 & ~mem) | (zmm31 & zmm0 & mem) ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] ; AVX512-NEXT: vpshufb %xmm4, %xmm9, %xmm1 ; AVX512-NEXT: vmovdqa %xmm4, %xmm6 @@ -13042,7 +13042,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm14 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512-NEXT: vpternlogq $226, %zmm1, %zmm0, %zmm14 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = (~zmm14 & ~zmm0 & zmm1) | (zmm14 & ~zmm0 & zmm1) | (zmm14 & zmm0 & ~zmm1) | (zmm14 & zmm0 & zmm1) ; AVX512-NEXT: vmovdqa64 %xmm17, %xmm1 ; AVX512-NEXT: vmovdqa64 %xmm20, %xmm3 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] @@ -13054,7 +13054,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpbroadcastd 68(%rax), %ymm5 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm25 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogd $184, %zmm1, %zmm19, %zmm25 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm25 = (~zmm25 & zmm19 & zmm1) | (zmm25 & ~zmm19 & ~zmm1) | (zmm25 & ~zmm19 & zmm1) | (zmm25 & zmm19 & zmm1) ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3] ; AVX512-NEXT: vpshufb %xmm6, %xmm8, %xmm3 ; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[1,1,2,2] @@ -13071,7 +13071,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7] ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 -; AVX512-NEXT: vpternlogq $226, %zmm3, %zmm0, %zmm5 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & ~zmm0 & zmm3) | (zmm5 & ~zmm0 & zmm3) | (zmm5 & zmm0 & ~zmm3) | (zmm5 & zmm0 & zmm3) ; AVX512-NEXT: vmovdqa64 %xmm29, %xmm0 ; AVX512-NEXT: vmovdqa64 %xmm24, %xmm1 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -13081,7 +13081,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpbroadcastd (%rax), %ymm1 ; AVX512-NEXT: vpbroadcastd 4(%rax), %ymm2 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm20 -; AVX512-NEXT: vpternlogd $184, %zmm0, %zmm19, %zmm20 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm20 = (~zmm20 & zmm19 & zmm0) | (zmm20 & ~zmm19 & ~zmm0) | (zmm20 & ~zmm19 & zmm0) | (zmm20 & zmm19 
& zmm0) ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] @@ -13156,7 +13156,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpbroadcastd 36(%rax), %ymm2 ; AVX512-NEXT: vpbroadcastd 40(%rax), %ymm4 ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm13 -; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm13 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm13 = (~zmm13 & zmm1 & mem) | (zmm13 & ~zmm1 & ~mem) | (zmm13 & zmm1 & ~mem) | (zmm13 & zmm1 & mem) ; AVX512-NEXT: vmovdqa 32(%rcx), %xmm7 ; AVX512-NEXT: vmovdqa 32(%rdx), %xmm6 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] @@ -13169,99 +13169,103 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $184, %zmm3, %zmm4, %zmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm4 & zmm3) | (zmm1 & ~zmm4 & ~zmm3) | (zmm1 & ~zmm4 & zmm3) | (zmm1 & zmm4 & zmm3) ; AVX512-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512-NEXT: # ymm3 = mem[2,1,3,2] ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512-NEXT: vpternlogq $184, %ymm1, %ymm29, %ymm3 -; AVX512-NEXT: vpternlogq $184, %ymm3, %ymm28, %ymm30 +; 
AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm29 & ymm1) | (ymm3 & ~ymm29 & ~ymm1) | (ymm3 & ~ymm29 & ymm1) | (ymm3 & ymm29 & ymm1) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm30 = (~ymm30 & ymm28 & ymm3) | (ymm30 & ~ymm28 & ~ymm3) | (ymm30 & ~ymm28 & ymm3) | (ymm30 & ymm28 & ymm3) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload ; AVX512-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm3 # 64-byte Folded Reload +; AVX512-NEXT: # zmm3 = (~zmm3 & ~zmm28 & mem) | (~zmm3 & zmm28 & mem) | (zmm3 & ~zmm28 & mem) | (zmm3 & zmm28 & ~mem) | (zmm3 & zmm28 & mem) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload ; AVX512-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm0 # 64-byte Folded Reload +; AVX512-NEXT: # zmm0 = (~zmm0 & ~zmm28 & mem) | (~zmm0 & zmm28 & mem) | (zmm0 & ~zmm28 & mem) | (zmm0 & zmm28 & ~mem) | (zmm0 & zmm28 & mem) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm23 # 32-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm18 # 32-byte Folded Reload ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $226, %zmm23, %zmm28, %zmm18 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm18 = (~zmm18 & ~zmm28 & zmm23) | (zmm18 & ~zmm28 & zmm23) | (zmm18 & zmm28 & ~zmm23) | (zmm18 & zmm28 & zmm23) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
[0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512-NEXT: vpternlogq $226, %zmm3, %zmm2, %zmm18 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm18 = (~zmm18 & ~zmm2 & zmm3) | (zmm18 & ~zmm2 & zmm3) | (zmm18 & zmm2 & ~zmm3) | (zmm18 & zmm2 & zmm3) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm23 # 32-byte Folded Reload -; AVX512-NEXT: vpternlogq $226, %zmm3, %zmm28, %zmm23 -; AVX512-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm23 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm23 = (~zmm23 & ~zmm28 & zmm3) | (zmm23 & ~zmm28 & zmm3) | (zmm23 & zmm28 & ~zmm3) | (zmm23 & zmm28 & zmm3) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm23 = (~zmm23 & ~zmm2 & zmm0) | (zmm23 & ~zmm2 & zmm0) | (zmm23 & zmm2 & ~zmm0) | (zmm23 & zmm2 & zmm0) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm17 # 32-byte Folded Reload -; AVX512-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm17 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = (~zmm17 & zmm0 & ~mem) | (zmm17 & ~zmm0 & mem) | (zmm17 & zmm0 & ~mem) | (zmm17 & zmm0 & mem) ; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm19, %ymm0 +; AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = (~ymm0 & ~ymm19 & mem) | (ymm0 & ~ymm19 & mem) | (ymm0 & ymm19 & ~mem) | (ymm0 & ymm19 & mem) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512-NEXT: vpternlogq $184, %ymm0, %ymm1, %ymm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm1 & ymm0) | (ymm2 & ~ymm1 & ~ymm0) | (ymm2 & ~ymm1 & ymm0) | (ymm2 & ymm1 & ymm0) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm19 # 64-byte Folded Reload +; AVX512-NEXT: # zmm19 = (~zmm19 & zmm1 & mem) | (zmm19 & ~zmm1 & ~mem) | (zmm19 & ~zmm1 & mem) | (zmm19 & zmm1 & ~mem) | (zmm19 & zmm1 & mem) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[0,1,2,3] ; AVX512-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512-NEXT: # zmm0 = (~zmm0 & ~zmm1 & mem) | (~zmm0 & zmm1 & mem) | (zmm0 & ~zmm1 & mem) | (zmm0 & zmm1 & ~mem) | (zmm0 & zmm1 & mem) ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm30[0,1,2,3],zmm1[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512-NEXT: vpternlogd $184, %zmm1, %zmm2, %zmm30 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm30 = (~zmm30 & zmm2 & zmm1) | (zmm30 & ~zmm2 & ~zmm1) | (zmm30 & ~zmm2 & zmm1) | (zmm30 & zmm2 & zmm1) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload ; AVX512-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-NEXT: vpternlogd $184, %zmm1, %zmm2, %zmm11 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm11 = (~zmm11 & zmm2 & zmm1) | (zmm11 & ~zmm2 & ~zmm1) | (zmm11 & ~zmm2 & zmm1) | (zmm11 & zmm2 & zmm1) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $184, %zmm14, %zmm1, %zmm25 -; AVX512-NEXT: vpternlogq $184, %zmm5, %zmm1, %zmm20 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm25 = (~zmm25 & zmm1 & zmm14) | (zmm25 & ~zmm1 & ~zmm14) | (zmm25 & ~zmm1 & zmm14) | (zmm25 & zmm1 & zmm14) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = (~zmm20 & zmm1 & zmm5) | (zmm20 & ~zmm1 & ~zmm5) | (zmm20 & ~zmm1 & zmm5) | (zmm20 & zmm1 & zmm5) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & ~zmm5 & zmm1) | (zmm2 & ~zmm5 & zmm1) | (zmm2 & zmm5 & ~zmm1) | (zmm2 & zmm5 & zmm1) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, %ymm22, %zmm3, %zmm22 -; AVX512-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm22 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = (~zmm22 & ~zmm5 & zmm1) | (zmm22 & 
~zmm5 & zmm1) | (zmm22 & zmm5 & ~zmm1) | (zmm22 & zmm5 & zmm1) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] ; AVX512-NEXT: vpermd 64(%rax), %zmm14, %zmm5 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogd $184, %zmm1, %zmm3, %zmm5 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm5 = (~zmm5 & zmm3 & zmm1) | (zmm5 & ~zmm3 & ~zmm1) | (zmm5 & ~zmm3 & zmm1) | (zmm5 & zmm3 & zmm1) ; AVX512-NEXT: vinserti64x4 $1, %ymm24, %zmm26, %zmm1 ; AVX512-NEXT: vpermd (%rax), %zmm14, %zmm14 -; AVX512-NEXT: vpternlogd $184, %zmm1, %zmm3, %zmm14 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm14 = (~zmm14 & zmm3 & zmm1) | (zmm14 & ~zmm3 & ~zmm1) | (zmm14 & ~zmm3 & zmm1) | (zmm14 & zmm3 & zmm1) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] -; AVX512-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm5 -; AVX512-NEXT: vpternlogq $184, %zmm22, %zmm1, %zmm14 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm1 & zmm2) | (zmm5 & ~zmm1 & ~zmm2) | (zmm5 & ~zmm1 & zmm2) | (zmm5 & zmm1 & zmm2) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = (~zmm14 & zmm1 & zmm22) | (zmm14 & ~zmm1 & ~zmm22) | (zmm14 & ~zmm1 & zmm22) | (zmm14 & zmm1 & zmm22) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload -; AVX512-NEXT: vpternlogq 
$184, %zmm1, %zmm4, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm4 & zmm1) | (zmm2 & ~zmm4 & ~zmm1) | (zmm2 & ~zmm4 & zmm1) | (zmm2 & zmm4 & zmm1) ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX512-NEXT: # ymm1 = mem[0,1,1,3] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm22 # 32-byte Folded Reload -; AVX512-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm22 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = (~zmm22 & ~zmm4 & zmm1) | (zmm22 & ~zmm4 & zmm1) | (zmm22 & zmm4 & ~zmm1) | (zmm22 & zmm4 & zmm1) ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm12[0,1,1,3] ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm21, %zmm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm15, %zmm8 -; AVX512-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm8 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm4 & zmm1) | (zmm8 & ~zmm4 & zmm1) | (zmm8 & zmm4 & ~zmm1) | (zmm8 & zmm4 & zmm1) ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2] @@ -13305,27 +13309,27 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm10[2,1,3,2] ; AVX512-NEXT: vpbroadcastd 96(%rax), %ymm10 ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512-NEXT: vpternlogd $184, %zmm7, %zmm29, %zmm9 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm9 = (~zmm9 & zmm29 & zmm7) | (zmm9 & ~zmm29 & ~zmm7) | (zmm9 & ~zmm29 & zmm7) | (zmm9 & zmm29 & zmm7) ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm24, %zmm3 ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm26[2,1,3,2] ; AVX512-NEXT: 
vpbroadcastd 32(%rax), %ymm10 ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7 -; AVX512-NEXT: vpternlogd $184, %zmm3, %zmm29, %zmm7 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm7 = (~zmm7 & zmm29 & zmm3) | (zmm7 & ~zmm29 & ~zmm3) | (zmm7 & ~zmm29 & zmm3) | (zmm7 & zmm29 & zmm3) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512-NEXT: vpternlogq $184, %zmm22, %zmm3, %zmm9 -; AVX512-NEXT: vpternlogq $184, %zmm8, %zmm3, %zmm7 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = (~zmm9 & zmm3 & zmm22) | (zmm9 & ~zmm3 & ~zmm22) | (zmm9 & ~zmm3 & zmm22) | (zmm9 & zmm3 & zmm22) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & zmm3 & zmm8) | (zmm7 & ~zmm3 & ~zmm8) | (zmm7 & ~zmm3 & zmm8) | (zmm7 & zmm3 & zmm8) ; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm16, %zmm3 ; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm21, %zmm8 -; AVX512-NEXT: vpternlogq $226, %zmm3, %zmm28, %zmm8 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm28 & zmm3) | (zmm8 & ~zmm28 & zmm3) | (zmm8 & zmm28 & ~zmm3) | (zmm8 & zmm28 & zmm3) ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm27, %zmm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm3 -; AVX512-NEXT: vpternlogq $226, %zmm1, %zmm28, %zmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & ~zmm28 & zmm1) | (zmm3 & ~zmm28 & zmm1) | (zmm3 & zmm28 & ~zmm1) | (zmm3 & zmm28 & zmm1) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512-NEXT: vpternlogq $184, %zmm8, %zmm1, %zmm31 -; AVX512-NEXT: vpternlogq $184, %zmm3, %zmm1, %zmm13 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm17, %zmm0 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm11 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm31 = (~zmm31 & zmm1 & zmm8) | (zmm31 & ~zmm1 & ~zmm8) | (zmm31 & ~zmm1 
& zmm8) | (zmm31 & zmm1 & zmm8) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = (~zmm13 & zmm1 & zmm3) | (zmm13 & ~zmm1 & ~zmm3) | (zmm13 & ~zmm1 & zmm3) | (zmm13 & zmm1 & zmm3) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm17 & mem) | (zmm0 & ~zmm17 & ~mem) | (zmm0 & zmm17 & ~mem) | (zmm0 & zmm17 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & zmm2 & mem) | (zmm11 & ~zmm2 & ~mem) | (zmm11 & zmm2 & ~mem) | (zmm11 & zmm2 & mem) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512-NEXT: vmovdqa64 %zmm13, 256(%rax) @@ -13442,26 +13446,26 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm16, %zmm11 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm12 -; AVX512-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm12 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = (~zmm12 & zmm11 & ~mem) | (zmm12 & ~zmm11 & mem) | (zmm12 & zmm11 & ~mem) | (zmm12 & zmm11 & mem) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vmovdqa 96(%r8), %ymm7 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17,u,u,u,u],zero,zero -; AVX512-FCP-NEXT: vpternlogq $248, %ymm13, %ymm12, %ymm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ymm12 & ymm13) | (ymm11 & ~ymm12 & ~ymm13) | (ymm11 & ~ymm12 & ymm13) | (ymm11 & ymm12 & ~ymm13) | (ymm11 & ymm12 & ymm13) ; AVX512-FCP-NEXT: vmovdqa 96(%r9), %ymm10 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm6 ; AVX512-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = 
[65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $248, %ymm16, %ymm11, %ymm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ymm11 & ymm16) | (ymm6 & ~ymm11 & ~ymm16) | (ymm6 & ~ymm11 & ymm16) | (ymm6 & ymm11 & ~ymm16) | (ymm6 & ymm11 & ymm16) ; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm11 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,4,0,0,0,5,0,0] ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm12 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512-FCP-NEXT: vpternlogq $184, %ymm11, %ymm16, %ymm12 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = (~ymm12 & ymm16 & ymm11) | (ymm12 & ~ymm16 & ~ymm11) | (ymm12 & ~ymm16 & ymm11) | (ymm12 & ymm16 & ymm11) ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm19 ; AVX512-FCP-NEXT: vprold $16, %ymm10, %ymm11 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $184, %ymm12, %ymm10, %ymm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ymm10 & ymm12) | (ymm11 & ~ymm10 & ~ymm12) | (ymm11 & ~ymm10 & ymm12) | (ymm11 & ymm10 & ymm12) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm11[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -13499,7 +13503,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,8,10,10,11] ; AVX512-FCP-NEXT: 
vpermt2q %zmm11, %zmm20, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm4, %zmm27, %zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm27 & zmm4) | (zmm1 & ~zmm27 & zmm4) | (zmm1 & zmm27 & ~zmm4) | (zmm1 & zmm27 & zmm4) ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,0,0,0,6,0,0,6] ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm1 @@ -13551,7 +13555,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[2,2,2,2,6,6,6,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm0 -; AVX512-FCP-NEXT: vpternlogq $226, %zmm4, %zmm27, %zmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & ~zmm27 & zmm4) | (zmm0 & ~zmm27 & zmm4) | (zmm0 & zmm27 & ~zmm4) | (zmm0 & zmm27 & zmm4) ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[3,3,3,3,7,7,7,7] @@ -13566,7 +13570,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm16, %zmm3 -; AVX512-FCP-NEXT: vpternlogd $184, %zmm4, %zmm19, %zmm3 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm3 = (~zmm3 & zmm19 & zmm4) | (zmm3 & ~zmm19 & ~zmm4) | (zmm3 & ~zmm19 & zmm4) | (zmm3 & zmm19 & zmm4) ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm10 ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm3 @@ -13587,7 +13591,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7,8,9],ymm4[10],ymm6[11,12],ymm4[13],ymm6[14,15] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm5, %zmm27, %zmm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = (~zmm6 & ~zmm27 & zmm5) | (zmm6 & ~zmm27 & zmm5) | (zmm6 & zmm27 & ~zmm5) | (zmm6 & zmm27 & zmm5) ; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm12 ; AVX512-FCP-NEXT: vprold $16, %ymm26, %ymm3 ; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm22 @@ -13603,8 +13607,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm25, %zmm1, %zmm1 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0] ; AVX512-FCP-NEXT: vpermd %zmm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpternlogd $184, %zmm5, %zmm18, %zmm0 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm18 & zmm5) | (zmm0 & ~zmm18 & ~zmm5) | (zmm0 & ~zmm18 & zmm5) | (zmm0 & zmm18 & zmm5) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm6 & mem) | (zmm0 & ~zmm6 & ~mem) | (zmm0 & zmm6 & ~mem) | (zmm0 & zmm6 & mem) ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm0 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm21[3,3,3,3,7,7,7,7] @@ -13637,7 +13641,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = 
[0,0,1,1,8,8,10,9] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm28, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm2, %zmm3, %zmm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & ~zmm3 & zmm2) | (zmm5 & ~zmm3 & zmm2) | (zmm5 & zmm3 & ~zmm2) | (zmm5 & zmm3 & zmm2) ; AVX512-FCP-NEXT: vmovdqa 64(%r9), %xmm2 ; AVX512-FCP-NEXT: vmovdqa 64(%r8), %xmm6 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] @@ -13653,9 +13657,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpbroadcastd 68(%rax), %ymm8 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm31 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogd $184, %zmm2, %zmm21, %zmm31 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm31 = (~zmm31 & zmm21 & zmm2) | (zmm31 & ~zmm21 & ~zmm2) | (zmm31 & ~zmm21 & zmm2) | (zmm31 & zmm21 & zmm2) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm5, %zmm2, %zmm31 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm31 = (~zmm31 & zmm2 & zmm5) | (zmm31 & ~zmm2 & ~zmm5) | (zmm31 & ~zmm2 & zmm5) | (zmm31 & zmm2 & zmm5) ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm0 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm8 @@ -13677,7 +13681,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} 
xmm8 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 -; AVX512-FCP-NEXT: vpternlogq $226, %zmm6, %zmm3, %zmm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm3 & zmm6) | (zmm8 & ~zmm3 & zmm6) | (zmm8 & zmm3 & ~zmm6) | (zmm8 & zmm3 & zmm6) ; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm3 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm5 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] @@ -13690,8 +13694,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpbroadcastd (%rax), %ymm5 ; AVX512-FCP-NEXT: vpbroadcastd 4(%rax), %ymm6 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm26 -; AVX512-FCP-NEXT: vpternlogd $184, %zmm3, %zmm21, %zmm26 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm8, %zmm2, %zmm26 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm26 = (~zmm26 & zmm21 & zmm3) | (zmm26 & ~zmm21 & ~zmm3) | (zmm26 & ~zmm21 & zmm3) | (zmm26 & zmm21 & zmm3) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = (~zmm26 & zmm2 & zmm8) | (zmm26 & ~zmm2 & ~zmm8) | (zmm26 & ~zmm2 & zmm8) | (zmm26 & zmm2 & zmm8) ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -13722,7 +13726,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm2 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm19[3,3,3,3,7,7,7,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm8[0,1,2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7,8,9,10],ymm2[11],ymm8[12,13],ymm2[14],ymm8[15] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm3, %zmm27, %zmm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & ~zmm27 & zmm3) | (zmm5 
& ~zmm27 & zmm3) | (zmm5 & zmm27 & ~zmm3) | (zmm5 & zmm27 & zmm3) ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512-FCP-NEXT: vprold $16, %ymm4, %ymm2 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload @@ -13740,7 +13744,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,1,3,3,8,8,9,9] ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm1, %zmm21, %zmm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & ~zmm21 & zmm1) | (zmm10 & ~zmm21 & zmm1) | (zmm10 & zmm21 & ~zmm1) | (zmm10 & zmm21 & zmm1) ; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[3,3,3,3,7,7,7,7] @@ -13749,8 +13753,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm16, %zmm1 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0] ; AVX512-FCP-NEXT: vpermd %zmm1, %zmm16, %zmm22 -; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm22 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm22 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm22 = (~zmm22 & zmm8 & mem) | (zmm22 & ~zmm8 & ~mem) | (zmm22 & zmm8 & ~mem) | (zmm22 & zmm8 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = (~zmm22 & zmm5 & mem) | (zmm22 & ~zmm5 & ~mem) | (zmm22 & zmm5 & ~mem) | (zmm22 & zmm5 & mem) ; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm5 ; AVX512-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm1 @@ -13781,14 +13785,14 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpbroadcastd 96(%rax), %ymm20 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm20, %zmm16, %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogd $184, %zmm2, %zmm18, %zmm25 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm25 = (~zmm25 & zmm18 & zmm2) | (zmm25 & ~zmm18 & ~zmm2) | (zmm25 & ~zmm18 & zmm2) | (zmm25 & zmm18 & zmm2) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm10, %zmm27, %zmm25 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm25 = (~zmm25 & zmm27 & zmm10) | (zmm25 & ~zmm27 & ~zmm10) | (zmm25 & ~zmm27 & zmm10) | (zmm25 & zmm27 & zmm10) ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX512-FCP-NEXT: vprold $16, %xmm3, %xmm3 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm0, %zmm21, %zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm21 & zmm0) | (zmm1 & ~zmm21 & zmm0) | (zmm1 & zmm21 & ~zmm0) | (zmm1 & zmm21 & zmm0) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,2,1,8,8,9,11] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm3 @@ -13800,11 +13804,11 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm0, 
%zmm28, %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm3, %zmm28, %zmm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & ~zmm28 & zmm3) | (zmm7 & ~zmm28 & zmm3) | (zmm7 & zmm28 & ~zmm3) | (zmm7 & zmm28 & zmm3) ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $184, %ymm0, %ymm18, %ymm16 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm16 = (~ymm16 & ymm18 & ymm0) | (ymm16 & ~ymm18 & ~ymm0) | (ymm16 & ~ymm18 & ymm0) | (ymm16 & ymm18 & ymm0) ; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm13 ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] @@ -13823,7 +13827,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpbroadcastd 104(%rax), %ymm19 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm5, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogd $184, %zmm6, %zmm24, %zmm5 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm5 = (~zmm5 & zmm24 & zmm6) | (zmm5 & ~zmm24 & ~zmm6) | (zmm5 & ~zmm24 & zmm6) | (zmm5 & zmm24 & zmm6) ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[3,3,3,3] @@ -13839,10 +13843,10 @@ define void 
@store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermd %ymm23, %ymm17, %ymm17 ; AVX512-FCP-NEXT: vpbroadcastd 32(%rax), %ymm19 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm17, %zmm20 -; AVX512-FCP-NEXT: vpternlogd $184, %zmm8, %zmm18, %zmm20 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm1, %zmm27, %zmm20 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm20 = (~zmm20 & zmm18 & zmm8) | (zmm20 & ~zmm18 & ~zmm8) | (zmm20 & ~zmm18 & zmm8) | (zmm20 & zmm18 & zmm8) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = (~zmm20 & zmm27 & zmm1) | (zmm20 & ~zmm27 & ~zmm1) | (zmm20 & ~zmm27 & zmm1) | (zmm20 & zmm27 & zmm1) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm7, %zmm1, %zmm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm1 & zmm7) | (zmm5 & ~zmm1 & ~zmm7) | (zmm5 & ~zmm1 & zmm7) | (zmm5 & zmm1 & zmm7) ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] ; AVX512-FCP-NEXT: vprold $16, %xmm9, %xmm8 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[1,1,2,3] @@ -13860,7 +13864,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,3,3,4,5,6,7] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,1,1,8,8,10,9] ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm7 -; AVX512-FCP-NEXT: vpternlogq $226, %zmm8, %zmm28, %zmm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & ~zmm28 & zmm8) | (zmm7 & ~zmm28 & zmm8) | (zmm7 & zmm28 & ~zmm8) | (zmm7 & zmm28 & zmm8) ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] ; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm0 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 @@ -13870,23 +13874,25 @@ define void @store_i16_stride7_vf64(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpbroadcastd 36(%rax), %ymm3 ; AVX512-FCP-NEXT: vpbroadcastd 40(%rax), %ymm8 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 -; AVX512-FCP-NEXT: vpternlogd $184, %zmm2, %zmm24, %zmm3 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm7, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm3 = (~zmm3 & zmm24 & zmm2) | (zmm3 & ~zmm24 & ~zmm2) | (zmm3 & ~zmm24 & zmm2) | (zmm3 & zmm24 & zmm2) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & zmm1 & zmm7) | (zmm3 & ~zmm1 & ~zmm7) | (zmm3 & ~zmm1 & zmm7) | (zmm3 & zmm1 & zmm7) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm0, %ymm13 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} ymm13 = (~ymm13 & ~ymm0 & mem) | (ymm13 & ~ymm0 & mem) | (ymm13 & ymm0 & ~mem) | (ymm13 & ymm0 & mem) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $184, %ymm13, %ymm1, %ymm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ymm1 & ymm13) | (ymm6 & ~ymm1 & ~ymm13) | (ymm6 & ~ymm1 & ymm13) | (ymm6 & ymm1 & ymm13) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512-FCP-NEXT: vpternlogq $184, %ymm16, %ymm2, %ymm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ymm2 & ymm16) | (ymm4 & ~ymm2 & ~ymm16) | (ymm4 & ~ymm2 & ymm16) | (ymm4 & ymm2 & ymm16) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm0[4,5,6,7] 
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm13 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm13 = (~zmm13 & zmm0 & mem) | (zmm13 & ~zmm0 & ~mem) | (zmm13 & zmm0 & ~mem) | (zmm13 & zmm0 & mem) ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm0 = (~zmm0 & ~zmm2 & mem) | (~zmm0 & zmm2 & mem) | (zmm0 & ~zmm2 & mem) | (zmm0 & zmm2 & ~mem) | (zmm0 & zmm2 & mem) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm2 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm2 = (~zmm2 & ~zmm4 & mem) | (~zmm2 & zmm4 & mem) | (zmm2 & ~zmm4 & mem) | (zmm2 & zmm4 & ~mem) | (zmm2 & zmm4 & mem) ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX512-FCP-NEXT: vpunpckhwd (%rsp), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX512-FCP-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] @@ -13926,25 +13932,27 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm14 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = (~zmm14 & zmm12 & mem) | (zmm14 & ~zmm12 & ~mem) | (zmm14 & zmm12 & ~mem) | (zmm14 & zmm12 & mem) ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 
32-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq $226, %zmm11, %zmm28, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm28 & zmm11) | (zmm4 & ~zmm28 & zmm11) | (zmm4 & zmm28 & ~zmm11) | (zmm4 & zmm28 & zmm11) ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq $226, %zmm6, %zmm28, %zmm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & ~zmm28 & zmm6) | (zmm7 & ~zmm28 & zmm6) | (zmm7 & zmm28 & ~zmm6) | (zmm7 & zmm28 & zmm6) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm0, %zmm6, %zmm4 -; AVX512-FCP-NEXT: vpternlogq $226, %zmm2, %zmm6, %zmm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm6 & zmm0) | (zmm4 & ~zmm6 & zmm0) | (zmm4 & zmm6 & ~zmm0) | (zmm4 & zmm6 & zmm0) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & ~zmm6 & zmm2) | (zmm7 & ~zmm6 & zmm2) | (zmm7 & zmm6 & ~zmm2) | (zmm7 & zmm6 & zmm2) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm0 & ~mem) | (zmm2 & ~zmm0 & mem) | (zmm2 & zmm0 & ~mem) | (zmm2 & zmm0 & mem) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm0 = (~zmm0 & zmm1 & mem) | (zmm0 & ~zmm1 & ~mem) | (zmm0 & ~zmm1 & mem) | (zmm0 & zmm1 & ~mem) | (zmm0 & 
zmm1 & mem) ; AVX512-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm10 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm10 +; AVX512-FCP-NEXT: # zmm10 = (~zmm10 & zmm1 & mem) | (zmm10 & ~zmm1 & ~mem) | (zmm10 & ~zmm1 & mem) | (zmm10 & zmm1 & ~mem) | (zmm10 & zmm1 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm2 & mem) | (zmm10 & ~zmm2 & ~mem) | (zmm10 & zmm2 & ~mem) | (zmm10 & zmm2 & mem) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 320(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 256(%rax) @@ -14097,24 +14105,24 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufb %ymm12, %ymm9, %ymm9 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm17, %zmm6 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm18, %zmm7 -; AVX512DQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & zmm6 & ~mem) | (zmm7 & ~zmm6 & mem) | (zmm7 & zmm6 & ~mem) | (zmm7 & zmm6 & mem) ; AVX512DQ-NEXT: vmovdqa 96(%r8), %ymm6 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[1,2,2,3,5,6,6,7] ; AVX512DQ-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[16,17,u,u,u,u],zero,zero ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $248, %ymm11, %ymm7, %ymm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ymm7 & ymm11) | (ymm6 & ~ymm7 & ~ymm11) | (ymm6 & ~ymm7 & ymm11) | (ymm6 & ymm7 & ~ymm11) | (ymm6 & ymm7 & ymm11) ; AVX512DQ-NEXT: vmovdqa64 
%zmm11, %zmm12 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $248, %ymm11, %ymm6, %ymm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ymm6 & ymm11) | (ymm9 & ~ymm6 & ~ymm11) | (ymm9 & ~ymm6 & ymm11) | (ymm9 & ymm6 & ~ymm11) | (ymm9 & ymm6 & ymm11) ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm7, %ymm6 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm10[2,1,3,3] ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512DQ-NEXT: vpternlogq $184, %ymm6, %ymm10, %ymm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = (~ymm7 & ymm10 & ymm6) | (ymm7 & ~ymm10 & ~ymm6) | (ymm7 & ~ymm10 & ymm6) | (ymm7 & ymm10 & ymm6) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm8[2,2,2,2] ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $184, %ymm7, %ymm8, %ymm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ymm8 & ymm7) | (ymm6 & ~ymm8 & ~ymm7) | (ymm6 & ~ymm8 & ymm7) | (ymm6 & ymm8 & ymm7) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,2,3],zmm6[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -14352,7 +14360,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpbroadcastd 100(%rax), %ymm1 ; AVX512DQ-NEXT: vpbroadcastd 104(%rax), %ymm2 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm31 -; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm31 +; 
AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm31 = (~zmm31 & zmm0 & mem) | (zmm31 & ~zmm0 & ~mem) | (zmm31 & zmm0 & ~mem) | (zmm31 & zmm0 & mem) ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] ; AVX512DQ-NEXT: vpshufb %xmm4, %xmm9, %xmm1 ; AVX512DQ-NEXT: vmovdqa %xmm4, %xmm6 @@ -14370,7 +14378,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm14 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512DQ-NEXT: vpternlogq $226, %zmm1, %zmm0, %zmm14 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = (~zmm14 & ~zmm0 & zmm1) | (zmm14 & ~zmm0 & zmm1) | (zmm14 & zmm0 & ~zmm1) | (zmm14 & zmm0 & zmm1) ; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm3 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] @@ -14382,7 +14390,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpbroadcastd 68(%rax), %ymm5 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm25 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogd $184, %zmm1, %zmm19, %zmm25 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm25 = (~zmm25 & zmm19 & zmm1) | (zmm25 & ~zmm19 & ~zmm1) | (zmm25 & ~zmm19 & zmm1) | (zmm25 & zmm19 & zmm1) ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3] ; AVX512DQ-NEXT: vpshufb %xmm6, %xmm8, %xmm3 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[1,1,2,2] @@ -14399,7 +14407,7 @@ 
define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 -; AVX512DQ-NEXT: vpternlogq $226, %zmm3, %zmm0, %zmm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & ~zmm0 & zmm3) | (zmm5 & ~zmm0 & zmm3) | (zmm5 & zmm0 & ~zmm3) | (zmm5 & zmm0 & zmm3) ; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm0 ; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm1 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -14409,7 +14417,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpbroadcastd (%rax), %ymm1 ; AVX512DQ-NEXT: vpbroadcastd 4(%rax), %ymm2 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm20 -; AVX512DQ-NEXT: vpternlogd $184, %zmm0, %zmm19, %zmm20 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm20 = (~zmm20 & zmm19 & zmm0) | (zmm20 & ~zmm19 & ~zmm0) | (zmm20 & ~zmm19 & zmm0) | (zmm20 & zmm19 & zmm0) ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] @@ -14484,7 +14492,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpbroadcastd 36(%rax), %ymm2 ; AVX512DQ-NEXT: vpbroadcastd 40(%rax), %ymm4 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm13 -; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm13 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm13 = (~zmm13 & zmm1 & mem) | (zmm13 & ~zmm1 & ~mem) | (zmm13 & zmm1 & ~mem) | (zmm13 & zmm1 & mem) ; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm7 ; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm6 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] @@ -14497,99 +14505,103 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $184, %zmm3, %zmm4, %zmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm4 & zmm3) | (zmm1 & ~zmm4 & ~zmm3) | (zmm1 & ~zmm4 & zmm3) | (zmm1 & zmm4 & zmm3) ; AVX512DQ-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm3 = mem[2,1,3,2] ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq $184, %ymm1, %ymm29, %ymm3 -; AVX512DQ-NEXT: vpternlogq $184, %ymm3, %ymm28, %ymm30 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm29 & ymm1) | (ymm3 & ~ymm29 & ~ymm1) | (ymm3 & ~ymm29 & ymm1) | (ymm3 & ymm29 & ymm1) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm30 = (~ymm30 & ymm28 & ymm3) | (ymm30 & ~ymm28 & ~ymm3) | (ymm30 & ~ymm28 & ymm3) | (ymm30 & ymm28 & ymm3) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload ; AVX512DQ-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm3 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm3 = (~zmm3 & ~zmm28 & mem) | (~zmm3 & zmm28 & mem) | (zmm3 & ~zmm28 & mem) | (zmm3 & zmm28 & ~mem) | (zmm3 & zmm28 & mem) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: 
vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm0 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm0 = (~zmm0 & ~zmm28 & mem) | (~zmm0 & zmm28 & mem) | (zmm0 & ~zmm28 & mem) | (zmm0 & zmm28 & ~mem) | (zmm0 & zmm28 & mem) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm23 # 32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm18 # 32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $226, %zmm23, %zmm28, %zmm18 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = (~zmm18 & ~zmm28 & zmm23) | (zmm18 & ~zmm28 & zmm23) | (zmm18 & zmm28 & ~zmm23) | (zmm18 & zmm28 & zmm23) ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512DQ-NEXT: vpternlogq $226, %zmm3, %zmm2, %zmm18 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = (~zmm18 & ~zmm2 & zmm3) | (zmm18 & ~zmm2 & zmm3) | (zmm18 & zmm2 & ~zmm3) | (zmm18 & zmm2 & zmm3) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm23 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq $226, %zmm3, %zmm28, %zmm23 -; AVX512DQ-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm23 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm23 = (~zmm23 & ~zmm28 & zmm3) | 
(zmm23 & ~zmm28 & zmm3) | (zmm23 & zmm28 & ~zmm3) | (zmm23 & zmm28 & zmm3) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm23 = (~zmm23 & ~zmm2 & zmm0) | (zmm23 & ~zmm2 & zmm0) | (zmm23 & zmm2 & ~zmm0) | (zmm23 & zmm2 & zmm0) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm17 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm17 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = (~zmm17 & zmm0 & ~mem) | (zmm17 & ~zmm0 & mem) | (zmm17 & zmm0 & ~mem) | (zmm17 & zmm0 & mem) ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512DQ-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm19, %ymm0 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm0 = (~ymm0 & ~ymm19 & mem) | (ymm0 & ~ymm19 & mem) | (ymm0 & ymm19 & ~mem) | (ymm0 & ymm19 & mem) ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-NEXT: vpternlogq $184, %ymm0, %ymm1, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm1 & ymm0) | (ymm2 & ~ymm1 & ~ymm0) | (ymm2 & ~ymm1 & ymm0) | (ymm2 & ymm1 & ymm0) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512DQ-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm19 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm19 = (~zmm19 & zmm1 & mem) | (zmm19 & ~zmm1 & ~mem) | (zmm19 & ~zmm1 & mem) | (zmm19 & zmm1 & ~mem) | (zmm19 & zmm1 & mem) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 
64-byte Folded Reload ; AVX512DQ-NEXT: # zmm0 = zmm0[0,1,2,3],mem[0,1,2,3] ; AVX512DQ-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm0 = (~zmm0 & ~zmm1 & mem) | (~zmm0 & zmm1 & mem) | (zmm0 & ~zmm1 & mem) | (zmm0 & zmm1 & ~mem) | (zmm0 & zmm1 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm30[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512DQ-NEXT: vpternlogd $184, %zmm1, %zmm2, %zmm30 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm30 = (~zmm30 & zmm2 & zmm1) | (zmm30 & ~zmm2 & ~zmm1) | (zmm30 & ~zmm2 & zmm1) | (zmm30 & zmm2 & zmm1) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogd $184, %zmm1, %zmm2, %zmm11 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm11 = (~zmm11 & zmm2 & zmm1) | (zmm11 & ~zmm2 & ~zmm1) | (zmm11 & ~zmm2 & zmm1) | (zmm11 & zmm2 & zmm1) ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $184, %zmm14, %zmm1, %zmm25 -; AVX512DQ-NEXT: vpternlogq $184, %zmm5, %zmm1, %zmm20 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm25 = (~zmm25 & zmm1 & zmm14) | (zmm25 & ~zmm1 & ~zmm14) | (zmm25 & ~zmm1 & zmm14) | (zmm25 & zmm1 & zmm14) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = (~zmm20 & zmm1 & zmm5) | (zmm20 & ~zmm1 & ~zmm5) | (zmm20 & ~zmm1 & zmm5) | (zmm20 & zmm1 & zmm5) ; AVX512DQ-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & ~zmm5 & zmm1) | (zmm2 & ~zmm5 & zmm1) | (zmm2 & zmm5 & ~zmm1) | (zmm2 & zmm5 & zmm1) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm22, %zmm3, %zmm22 -; AVX512DQ-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm22 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = (~zmm22 & ~zmm5 & zmm1) | (zmm22 & ~zmm5 & zmm1) | (zmm22 & zmm5 & ~zmm1) | (zmm22 & zmm5 & zmm1) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] ; AVX512DQ-NEXT: vpermd 64(%rax), %zmm14, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogd $184, %zmm1, %zmm3, %zmm5 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm5 = (~zmm5 & zmm3 & zmm1) | (zmm5 & ~zmm3 & ~zmm1) | (zmm5 & ~zmm3 & zmm1) | (zmm5 & zmm3 & zmm1) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm24, %zmm26, %zmm1 
; AVX512DQ-NEXT: vpermd (%rax), %zmm14, %zmm14 -; AVX512DQ-NEXT: vpternlogd $184, %zmm1, %zmm3, %zmm14 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm14 = (~zmm14 & zmm3 & zmm1) | (zmm14 & ~zmm3 & ~zmm1) | (zmm14 & ~zmm3 & zmm1) | (zmm14 & zmm3 & zmm1) ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] -; AVX512DQ-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm5 -; AVX512DQ-NEXT: vpternlogq $184, %zmm22, %zmm1, %zmm14 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm1 & zmm2) | (zmm5 & ~zmm1 & ~zmm2) | (zmm5 & ~zmm1 & zmm2) | (zmm5 & zmm1 & zmm2) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = (~zmm14 & zmm1 & zmm22) | (zmm14 & ~zmm1 & ~zmm22) | (zmm14 & ~zmm1 & zmm22) | (zmm14 & zmm1 & zmm22) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq $184, %zmm1, %zmm4, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm4 & zmm1) | (zmm2 & ~zmm4 & ~zmm1) | (zmm2 & ~zmm4 & zmm1) | (zmm2 & zmm4 & zmm1) ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm1 = mem[0,1,1,3] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm22 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm22 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = (~zmm22 & ~zmm4 & zmm1) | (zmm22 & ~zmm4 & zmm1) | (zmm22 & zmm4 & ~zmm1) | (zmm22 & zmm4 & zmm1) 
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm12[0,1,1,3] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm21, %zmm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm15, %zmm8 -; AVX512DQ-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm4 & zmm1) | (zmm8 & ~zmm4 & zmm1) | (zmm8 & zmm4 & ~zmm1) | (zmm8 & zmm4 & zmm1) ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2] @@ -14633,27 +14645,27 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm10[2,1,3,2] ; AVX512DQ-NEXT: vpbroadcastd 96(%rax), %ymm10 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512DQ-NEXT: vpternlogd $184, %zmm7, %zmm29, %zmm9 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm9 = (~zmm9 & zmm29 & zmm7) | (zmm9 & ~zmm29 & ~zmm7) | (zmm9 & ~zmm29 & zmm7) | (zmm9 & zmm29 & zmm7) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm24, %zmm3 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm26[2,1,3,2] ; AVX512DQ-NEXT: vpbroadcastd 32(%rax), %ymm10 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpternlogd $184, %zmm3, %zmm29, %zmm7 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm7 = (~zmm7 & zmm29 & zmm3) | (zmm7 & ~zmm29 & ~zmm3) | (zmm7 & ~zmm29 & zmm3) | (zmm7 & zmm29 & zmm3) ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq $184, %zmm22, %zmm3, %zmm9 -; AVX512DQ-NEXT: vpternlogq $184, %zmm8, %zmm3, %zmm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = (~zmm9 & zmm3 & zmm22) | (zmm9 & ~zmm3 & ~zmm22) | (zmm9 & ~zmm3 & zmm22) | (zmm9 & zmm3 & zmm22) +; AVX512DQ-NEXT: vpternlogq 
{{.*#+}} zmm7 = (~zmm7 & zmm3 & zmm8) | (zmm7 & ~zmm3 & ~zmm8) | (zmm7 & ~zmm3 & zmm8) | (zmm7 & zmm3 & zmm8) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm16, %zmm3 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm21, %zmm8 -; AVX512DQ-NEXT: vpternlogq $226, %zmm3, %zmm28, %zmm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm28 & zmm3) | (zmm8 & ~zmm28 & zmm3) | (zmm8 & zmm28 & ~zmm3) | (zmm8 & zmm28 & zmm3) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm27, %zmm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm3 -; AVX512DQ-NEXT: vpternlogq $226, %zmm1, %zmm28, %zmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & ~zmm28 & zmm1) | (zmm3 & ~zmm28 & zmm1) | (zmm3 & zmm28 & ~zmm1) | (zmm3 & zmm28 & zmm1) ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512DQ-NEXT: vpternlogq $184, %zmm8, %zmm1, %zmm31 -; AVX512DQ-NEXT: vpternlogq $184, %zmm3, %zmm1, %zmm13 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm17, %zmm0 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm31 = (~zmm31 & zmm1 & zmm8) | (zmm31 & ~zmm1 & ~zmm8) | (zmm31 & ~zmm1 & zmm8) | (zmm31 & zmm1 & zmm8) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = (~zmm13 & zmm1 & zmm3) | (zmm13 & ~zmm1 & ~zmm3) | (zmm13 & ~zmm1 & zmm3) | (zmm13 & zmm1 & zmm3) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm17 & mem) | (zmm0 & ~zmm17 & ~mem) | (zmm0 & zmm17 & ~mem) | (zmm0 & zmm17 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & zmm2 & mem) | (zmm11 & ~zmm2 & ~mem) | (zmm11 & zmm2 & ~mem) | (zmm11 & zmm2 & mem) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 256(%rax) @@ -14770,26 +14782,26 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm16, %zmm11 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm12 -; AVX512DQ-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm12 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = (~zmm12 & zmm11 & ~mem) | (zmm12 & ~zmm11 & mem) | (zmm12 & zmm11 & ~mem) | (zmm12 & zmm11 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %ymm7 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17,u,u,u,u],zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm13, %ymm12, %ymm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ymm12 & ymm13) | (ymm11 & ~ymm12 & ~ymm13) | (ymm11 & ~ymm12 & ymm13) | (ymm11 & ymm12 & ~ymm13) | (ymm11 & ymm12 & ymm13) ; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %ymm10 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm16, %ymm11, %ymm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ymm11 & ymm16) | (ymm6 & ~ymm11 & ~ymm16) | (ymm6 & ~ymm11 & ymm16) | (ymm6 & ymm11 & ~ymm16) | (ymm6 & ymm11 & ymm16) ; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm11 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,4,0,0,0,5,0,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = 
[65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm11, %ymm16, %ymm12 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = (~ymm12 & ymm16 & ymm11) | (ymm12 & ~ymm16 & ~ymm11) | (ymm12 & ~ymm16 & ymm11) | (ymm12 & ymm16 & ymm11) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm19 ; AVX512DQ-FCP-NEXT: vprold $16, %ymm10, %ymm11 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm12, %ymm10, %ymm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ymm10 & ymm12) | (ymm11 & ~ymm10 & ~ymm12) | (ymm11 & ~ymm10 & ymm12) | (ymm11 & ymm10 & ymm12) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm11[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -14827,7 +14839,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,8,10,10,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm4, %zmm27, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm27 & zmm4) | (zmm1 & ~zmm27 & zmm4) | (zmm1 & zmm27 & ~zmm4) | (zmm1 & zmm27 & zmm4) ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,0,0,0,6,0,0,6] ; 
AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm1 @@ -14879,7 +14891,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[2,2,2,2,6,6,6,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm4, %zmm27, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & ~zmm27 & zmm4) | (zmm0 & ~zmm27 & zmm4) | (zmm0 & zmm27 & ~zmm4) | (zmm0 & zmm27 & zmm4) ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[3,3,3,3,7,7,7,7] @@ -14894,7 +14906,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm16, %zmm3 -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm4, %zmm19, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm3 = (~zmm3 & zmm19 & zmm4) | (zmm3 & ~zmm19 & ~zmm4) | (zmm3 & ~zmm19 & zmm4) | (zmm3 & zmm19 & zmm4) ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm10 ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm3 @@ -14915,7 +14927,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7,8,9],ymm4[10],ymm6[11,12],ymm4[13],ymm6[14,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = 
[0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm5, %zmm27, %zmm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = (~zmm6 & ~zmm27 & zmm5) | (zmm6 & ~zmm27 & zmm5) | (zmm6 & zmm27 & ~zmm5) | (zmm6 & zmm27 & zmm5) ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm12 ; AVX512DQ-FCP-NEXT: vprold $16, %ymm26, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm22 @@ -14931,8 +14943,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm25, %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm5, %zmm18, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm18 & zmm5) | (zmm0 & ~zmm18 & ~zmm5) | (zmm0 & ~zmm18 & zmm5) | (zmm0 & zmm18 & zmm5) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm6 & mem) | (zmm0 & ~zmm6 & ~mem) | (zmm0 & zmm6 & ~mem) | (zmm0 & zmm6 & mem) ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm21[3,3,3,3,7,7,7,7] @@ -14965,7 +14977,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,1,1,8,8,10,9] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm28, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm2, %zmm3, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & ~zmm3 & zmm2) | (zmm5 & ~zmm3 & 
zmm2) | (zmm5 & zmm3 & ~zmm2) | (zmm5 & zmm3 & zmm2) ; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %xmm6 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] @@ -14981,9 +14993,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpbroadcastd 68(%rax), %ymm8 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm31 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm2, %zmm21, %zmm31 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm31 = (~zmm31 & zmm21 & zmm2) | (zmm31 & ~zmm21 & ~zmm2) | (zmm31 & ~zmm21 & zmm2) | (zmm31 & zmm21 & zmm2) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm5, %zmm2, %zmm31 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm31 = (~zmm31 & zmm2 & zmm5) | (zmm31 & ~zmm2 & ~zmm5) | (zmm31 & ~zmm2 & zmm5) | (zmm31 & zmm2 & zmm5) ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm8 @@ -15005,7 +15017,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm6, %zmm3, %zmm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm3 & zmm6) | (zmm8 & ~zmm3 & zmm6) | (zmm8 & zmm3 & ~zmm6) | (zmm8 & zmm3 & zmm6) ; 
AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm5 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] @@ -15018,8 +15030,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpbroadcastd (%rax), %ymm5 ; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%rax), %ymm6 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm26 -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm3, %zmm21, %zmm26 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm8, %zmm2, %zmm26 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm26 = (~zmm26 & zmm21 & zmm3) | (zmm26 & ~zmm21 & ~zmm3) | (zmm26 & ~zmm21 & zmm3) | (zmm26 & zmm21 & zmm3) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = (~zmm26 & zmm2 & zmm8) | (zmm26 & ~zmm2 & ~zmm8) | (zmm26 & ~zmm2 & zmm8) | (zmm26 & zmm2 & zmm8) ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -15050,7 +15062,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm19[3,3,3,3,7,7,7,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm8[0,1,2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7,8,9,10],ymm2[11],ymm8[12,13],ymm2[14],ymm8[15] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm3, %zmm27, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & ~zmm27 & zmm3) | (zmm5 & ~zmm27 & zmm3) | (zmm5 & zmm27 & ~zmm3) | (zmm5 & zmm27 & zmm3) ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vprold $16, %ymm4, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload @@ -15068,7 +15080,7 @@ define void @store_i16_stride7_vf64(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,1,3,3,8,8,9,9] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm1, %zmm21, %zmm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & ~zmm21 & zmm1) | (zmm10 & ~zmm21 & zmm1) | (zmm10 & zmm21 & ~zmm1) | (zmm10 & zmm21 & zmm1) ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[3,3,3,3,7,7,7,7] @@ -15077,8 +15089,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm16, %zmm1 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm16, %zmm22 -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm22 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm22 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm22 = (~zmm22 & zmm8 & mem) | (zmm22 & ~zmm8 & ~mem) | (zmm22 & zmm8 & ~mem) | (zmm22 & zmm8 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = (~zmm22 & zmm5 & mem) | (zmm22 & ~zmm5 & ~mem) | (zmm22 & zmm5 & ~mem) | (zmm22 & zmm5 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm5 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm1 @@ -15109,14 +15121,14 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpbroadcastd 96(%rax), %ymm20 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm20, %zmm16, %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm18 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm2, %zmm18, %zmm25 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm25 = (~zmm25 & zmm18 & zmm2) | (zmm25 & ~zmm18 & ~zmm2) | (zmm25 & ~zmm18 & zmm2) | (zmm25 & zmm18 & zmm2) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm10, %zmm27, %zmm25 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm25 = (~zmm25 & zmm27 & zmm10) | (zmm25 & ~zmm27 & ~zmm10) | (zmm25 & ~zmm27 & zmm10) | (zmm25 & zmm27 & zmm10) ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX512DQ-FCP-NEXT: vprold $16, %xmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm0, %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm21 & zmm0) | (zmm1 & ~zmm21 & zmm0) | (zmm1 & zmm21 & ~zmm0) | (zmm1 & zmm21 & zmm0) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,2,1,8,8,9,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm3 @@ -15128,11 +15140,11 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm28, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq 
$226, %zmm3, %zmm28, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & ~zmm28 & zmm3) | (zmm7 & ~zmm28 & zmm3) | (zmm7 & zmm28 & ~zmm3) | (zmm7 & zmm28 & zmm3) ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm0, %ymm18, %ymm16 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm16 = (~ymm16 & ymm18 & ymm0) | (ymm16 & ~ymm18 & ~ymm0) | (ymm16 & ~ymm18 & ymm0) | (ymm16 & ymm18 & ymm0) ; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm13 ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] @@ -15151,7 +15163,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpbroadcastd 104(%rax), %ymm19 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm5, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm6, %zmm24, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm5 = (~zmm5 & zmm24 & zmm6) | (zmm5 & ~zmm24 & ~zmm6) | (zmm5 & ~zmm24 & zmm6) | (zmm5 & zmm24 & zmm6) ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[3,3,3,3] @@ -15167,10 +15179,10 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm17, %ymm17 ; AVX512DQ-FCP-NEXT: vpbroadcastd 32(%rax), %ymm19 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm17, 
%zmm20 -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm8, %zmm18, %zmm20 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm27, %zmm20 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm20 = (~zmm20 & zmm18 & zmm8) | (zmm20 & ~zmm18 & ~zmm8) | (zmm20 & ~zmm18 & zmm8) | (zmm20 & zmm18 & zmm8) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = (~zmm20 & zmm27 & zmm1) | (zmm20 & ~zmm27 & ~zmm1) | (zmm20 & ~zmm27 & zmm1) | (zmm20 & zmm27 & zmm1) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm7, %zmm1, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm1 & zmm7) | (zmm5 & ~zmm1 & ~zmm7) | (zmm5 & ~zmm1 & zmm7) | (zmm5 & zmm1 & zmm7) ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] ; AVX512DQ-FCP-NEXT: vprold $16, %xmm9, %xmm8 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[1,1,2,3] @@ -15188,7 +15200,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,3,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,1,1,8,8,10,9] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm8, %zmm28, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & ~zmm28 & zmm8) | (zmm7 & ~zmm28 & zmm8) | (zmm7 & zmm28 & ~zmm8) | (zmm7 & zmm28 & zmm8) ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm0 ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 @@ -15198,23 +15210,25 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpbroadcastd 36(%rax), %ymm3 ; AVX512DQ-FCP-NEXT: vpbroadcastd 40(%rax), %ymm8 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, 
%zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm2, %zmm24, %zmm3 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm7, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm3 = (~zmm3 & zmm24 & zmm2) | (zmm3 & ~zmm24 & ~zmm2) | (zmm3 & ~zmm24 & zmm2) | (zmm3 & zmm24 & zmm2) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & zmm1 & zmm7) | (zmm3 & ~zmm1 & ~zmm7) | (zmm3 & ~zmm1 & zmm7) | (zmm3 & zmm1 & zmm7) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm0, %ymm13 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} ymm13 = (~ymm13 & ~ymm0 & mem) | (ymm13 & ~ymm0 & mem) | (ymm13 & ymm0 & ~mem) | (ymm13 & ymm0 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm13, %ymm1, %ymm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ymm1 & ymm13) | (ymm6 & ~ymm1 & ~ymm13) | (ymm6 & ~ymm1 & ymm13) | (ymm6 & ymm1 & ymm13) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm16, %ymm2, %ymm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ymm2 & ymm16) | (ymm4 & ~ymm2 & ~ymm16) | (ymm4 & ~ymm2 & ymm16) | (ymm4 & ymm2 & ymm16) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogd $216, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm13 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm13 = (~zmm13 & zmm0 & mem) | (zmm13 & ~zmm0 & ~mem) | (zmm13 & zmm0 & ~mem) | (zmm13 & zmm0 & mem) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm0 = (~zmm0 & ~zmm2 & mem) | (~zmm0 & zmm2 & mem) | (zmm0 & ~zmm2 & mem) | (zmm0 & zmm2 & ~mem) | (zmm0 & zmm2 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm2 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm2 = (~zmm2 & ~zmm4 & mem) | (~zmm2 & zmm4 & mem) | (zmm2 & ~zmm4 & mem) | (zmm2 & zmm4 & ~mem) | (zmm2 & zmm4 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX512DQ-FCP-NEXT: vpunpckhwd (%rsp), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] @@ -15254,25 +15268,27 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm14 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = (~zmm14 & zmm12 & mem) | (zmm14 & ~zmm12 & ~mem) | (zmm14 & zmm12 & ~mem) | (zmm14 & zmm12 & mem) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm11, %zmm28, %zmm4 
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm28 & zmm11) | (zmm4 & ~zmm28 & zmm11) | (zmm4 & zmm28 & ~zmm11) | (zmm4 & zmm28 & zmm11) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm6, %zmm28, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & ~zmm28 & zmm6) | (zmm7 & ~zmm28 & zmm6) | (zmm7 & zmm28 & ~zmm6) | (zmm7 & zmm28 & zmm6) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm0, %zmm6, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm2, %zmm6, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm6 & zmm0) | (zmm4 & ~zmm6 & zmm0) | (zmm4 & zmm6 & ~zmm0) | (zmm4 & zmm6 & zmm0) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & ~zmm6 & zmm2) | (zmm7 & ~zmm6 & zmm2) | (zmm7 & zmm6 & ~zmm2) | (zmm7 & zmm6 & zmm2) ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm0 & ~mem) | (zmm2 & ~zmm0 & mem) | (zmm2 & zmm0 & ~mem) | (zmm2 & zmm0 & mem) ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm0 = (~zmm0 & zmm1 & mem) | (zmm0 & ~zmm1 & ~mem) | (zmm0 & ~zmm1 & mem) | (zmm0 & zmm1 & ~mem) | (zmm0 & zmm1 & mem) ; AVX512DQ-FCP-NEXT: 
vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm10 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm10 +; AVX512DQ-FCP-NEXT: # zmm10 = (~zmm10 & zmm1 & mem) | (zmm10 & ~zmm1 & ~mem) | (zmm10 & ~zmm1 & mem) | (zmm10 & zmm1 & ~mem) | (zmm10 & zmm1 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm2 & mem) | (zmm10 & ~zmm2 & ~mem) | (zmm10 & zmm2 & ~mem) | (zmm10 & zmm2 & mem) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 320(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 256(%rax) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll index 311166ef60dda..47690f3c60edf 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -7226,7 +7226,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: # ymm5 = mem[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm10 ; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm13 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512-NEXT: vpternlogq $184, %zmm3, %zmm13, %zmm10 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm13 & zmm3) | (zmm10 & ~zmm13 & ~zmm3) | (zmm10 & ~zmm13 & zmm3) | (zmm10 & zmm13 & zmm3) ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512-NEXT: # ymm3 = mem[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload @@ -7253,7 +7253,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm4, 
%zmm2, %zmm9 -; AVX512-NEXT: vpternlogq $184, %zmm0, %zmm13, %zmm9 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = (~zmm9 & zmm13 & zmm0) | (zmm9 & ~zmm13 & ~zmm0) | (zmm9 & ~zmm13 & zmm0) | (zmm9 & zmm13 & zmm0) ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload @@ -7280,7 +7280,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,1,1,3,4,5,5,7] ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm31[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm11 -; AVX512-NEXT: vpternlogq $184, %zmm0, %zmm13, %zmm11 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & zmm13 & zmm0) | (zmm11 & ~zmm13 & ~zmm0) | (zmm11 & ~zmm13 & zmm0) | (zmm11 & zmm13 & zmm0) ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm28[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,2,2,3,4,6,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -7314,7 +7314,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[0,1,1,3,4,5,5,7] ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 -; AVX512-NEXT: vpternlogq $184, %zmm0, %zmm13, %zmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm13 & zmm0) | (zmm4 & ~zmm13 & ~zmm0) | (zmm4 & ~zmm13 & zmm0) | (zmm4 & zmm13 & zmm0) ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm24[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm22[0,2,2,3,4,6,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 @@ -7391,7 +7391,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload ; AVX512-NEXT: # ymm23 = mem[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, 
%ymm23, %zmm5, %zmm5 -; AVX512-NEXT: vpternlogq $184, %zmm12, %zmm13, %zmm5 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm13 & zmm12) | (zmm5 & ~zmm13 & ~zmm12) | (zmm5 & ~zmm13 & zmm12) | (zmm5 & zmm13 & zmm12) ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX512-NEXT: # ymm12 = mem[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload @@ -7427,7 +7427,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload ; AVX512-NEXT: # ymm30 = mem[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm30, %zmm12, %zmm12 -; AVX512-NEXT: vpternlogq $184, %zmm0, %zmm13, %zmm12 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = (~zmm12 & zmm13 & zmm0) | (zmm12 & ~zmm13 & ~zmm0) | (zmm12 & ~zmm13 & zmm0) | (zmm12 & zmm13 & zmm0) ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm30 = xmm2[3,3,3,3] ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm31 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero @@ -7454,7 +7454,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufd {{.*#+}} ymm16 = ymm28[0,1,1,3,4,5,5,7] ; AVX512-NEXT: vpshufd {{.*#+}} ymm18 = ymm18[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16 -; AVX512-NEXT: vpternlogq $184, %zmm7, %zmm13, %zmm16 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = (~zmm16 & zmm13 & zmm7) | (zmm16 & ~zmm13 & ~zmm7) | (zmm16 & ~zmm13 & zmm7) | (zmm16 & zmm13 & zmm7) ; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm21[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd {{.*#+}} ymm18 = ymm22[0,2,2,3,4,6,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm7, %zmm7 @@ -7473,7 +7473,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,1,1,3,4,5,5,7] ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = 
ymm15[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2 -; AVX512-NEXT: vpternlogq $184, %zmm0, %zmm13, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm13 & zmm0) | (zmm2 & ~zmm13 & ~zmm0) | (zmm2 & ~zmm13 & zmm0) | (zmm2 & zmm13 & zmm0) ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm20[0,2,2,3,4,6,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 @@ -7664,6 +7664,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} zmm7 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm3 = (~zmm3 & ~zmm7 & mem) | (zmm3 & ~zmm7 & mem) | (zmm3 & zmm7 & ~mem) | (zmm3 & zmm7 & mem) ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] ; AVX512-FCP-NEXT: vpandnq %zmm19, %zmm2, %zmm19 ; AVX512-FCP-NEXT: vpandq %zmm2, %zmm21, %zmm21 @@ -7747,30 +7748,32 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm8, %ymm5 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm6 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm6 = (~zmm6 & ~zmm7 & mem) | (zmm6 & ~zmm7 & mem) | (zmm6 & zmm7 & ~mem) | (zmm6 & zmm7 & mem) ; AVX512-FCP-NEXT: vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vpord %zmm5, %zmm8, %zmm6 
{%k1} ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm15 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm15 = (~zmm15 & ~zmm7 & mem) | (zmm15 & ~zmm7 & mem) | (zmm15 & zmm7 & ~mem) | (zmm15 & zmm7 & mem) ; AVX512-FCP-NEXT: vpandnq (%rsp), %zmm2, %zmm5 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vpord %zmm5, %zmm8, %zmm15 {%k1} -; AVX512-FCP-NEXT: vpternlogq $226, %zmm20, %zmm7, %zmm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & ~zmm7 & zmm20) | (zmm11 & ~zmm7 & zmm20) | (zmm11 & zmm7 & ~zmm20) | (zmm11 & zmm7 & zmm20) ; AVX512-FCP-NEXT: vpandnq %zmm22, %zmm2, %zmm5 ; AVX512-FCP-NEXT: vpandq %zmm2, %zmm23, %zmm8 ; AVX512-FCP-NEXT: vpord %zmm5, %zmm8, %zmm11 {%k1} -; AVX512-FCP-NEXT: vpternlogq $226, %zmm24, %zmm7, %zmm14 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = (~zmm14 & ~zmm7 & zmm24) | (zmm14 & ~zmm7 & zmm24) | (zmm14 & zmm7 & ~zmm24) | (zmm14 & zmm7 & zmm24) ; AVX512-FCP-NEXT: vpandnq %zmm25, %zmm2, %zmm5 ; AVX512-FCP-NEXT: vpandq %zmm2, %zmm26, %zmm8 ; AVX512-FCP-NEXT: vpord %zmm5, %zmm8, %zmm14 {%k1} -; AVX512-FCP-NEXT: vpternlogq $226, %zmm27, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & ~zmm7 & zmm27) | (zmm0 & ~zmm7 & zmm27) | (zmm0 & zmm7 & ~zmm27) | (zmm0 & zmm7 & zmm27) ; AVX512-FCP-NEXT: vpandnq %zmm28, %zmm2, %zmm5 ; AVX512-FCP-NEXT: vpandq %zmm2, %zmm29, %zmm8 ; AVX512-FCP-NEXT: vpord %zmm5, %zmm8, %zmm0 {%k1} -; AVX512-FCP-NEXT: vpternlogq $226, %zmm30, %zmm7, %zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm7 & zmm30) | (zmm1 & ~zmm7 & zmm30) | (zmm1 & zmm7 & ~zmm30) | (zmm1 & zmm7 & zmm30) ; AVX512-FCP-NEXT: vpandnq %zmm31, %zmm2, %zmm5 ; AVX512-FCP-NEXT: vpandq %zmm2, %zmm16, %zmm8 ; AVX512-FCP-NEXT: vpord %zmm5, %zmm8, %zmm1 {%k1} -; AVX512-FCP-NEXT: vpternlogq $226, %zmm19, %zmm7, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm7 & zmm19) | (zmm4 & ~zmm7 & 
zmm19) | (zmm4 & zmm7 & ~zmm19) | (zmm4 & zmm7 & zmm19) ; AVX512-FCP-NEXT: vpandnq %zmm13, %zmm2, %zmm5 ; AVX512-FCP-NEXT: vpandq %zmm2, %zmm21, %zmm2 ; AVX512-FCP-NEXT: vpord %zmm5, %zmm2, %zmm4 {%k1} @@ -7999,7 +8002,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: # ymm5 = mem[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm10 ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} zmm13 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq $184, %zmm3, %zmm13, %zmm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm13 & zmm3) | (zmm10 & ~zmm13 & ~zmm3) | (zmm10 & ~zmm13 & zmm3) | (zmm10 & zmm13 & zmm3) ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm3 = mem[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload @@ -8026,7 +8029,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm9 -; AVX512DQ-NEXT: vpternlogq $184, %zmm0, %zmm13, %zmm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = (~zmm9 & zmm13 & zmm0) | (zmm9 & ~zmm13 & ~zmm0) | (zmm9 & ~zmm13 & zmm0) | (zmm9 & zmm13 & zmm0) ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload @@ -8053,7 +8056,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,1,1,3,4,5,5,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm31[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, 
%zmm1, %zmm11 -; AVX512DQ-NEXT: vpternlogq $184, %zmm0, %zmm13, %zmm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & zmm13 & zmm0) | (zmm11 & ~zmm13 & ~zmm0) | (zmm11 & ~zmm13 & zmm0) | (zmm11 & zmm13 & zmm0) ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm28[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -8087,7 +8090,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[0,1,1,3,4,5,5,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 -; AVX512DQ-NEXT: vpternlogq $184, %zmm0, %zmm13, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm13 & zmm0) | (zmm4 & ~zmm13 & ~zmm0) | (zmm4 & ~zmm13 & zmm0) | (zmm4 & zmm13 & zmm0) ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm24[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm22[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 @@ -8164,7 +8167,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm23 = mem[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm23, %zmm5, %zmm5 -; AVX512DQ-NEXT: vpternlogq $184, %zmm12, %zmm13, %zmm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm13 & zmm12) | (zmm5 & ~zmm13 & ~zmm12) | (zmm5 & ~zmm13 & zmm12) | (zmm5 & zmm13 & zmm12) ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm12 = mem[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload @@ -8200,7 +8203,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload ; AVX512DQ-NEXT: # 
ymm30 = mem[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm30, %zmm12, %zmm12 -; AVX512DQ-NEXT: vpternlogq $184, %zmm0, %zmm13, %zmm12 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = (~zmm12 & zmm13 & zmm0) | (zmm12 & ~zmm13 & ~zmm0) | (zmm12 & ~zmm13 & zmm0) | (zmm12 & zmm13 & zmm0) ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm30 = xmm2[3,3,3,3] ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm31 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero @@ -8227,7 +8230,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm16 = ymm28[0,1,1,3,4,5,5,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm18 = ymm18[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16 -; AVX512DQ-NEXT: vpternlogq $184, %zmm7, %zmm13, %zmm16 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = (~zmm16 & zmm13 & zmm7) | (zmm16 & ~zmm13 & ~zmm7) | (zmm16 & ~zmm13 & zmm7) | (zmm16 & zmm13 & zmm7) ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm21[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm18 = ymm22[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm18, %zmm7, %zmm7 @@ -8246,7 +8249,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,1,1,3,4,5,5,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpternlogq $184, %zmm0, %zmm13, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm13 & zmm0) | (zmm2 & ~zmm13 & ~zmm0) | (zmm2 & ~zmm13 & zmm0) | (zmm2 & zmm13 & zmm0) ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm20[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 @@ -8437,6 +8440,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm3 ; 
AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} zmm7 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm3 = (~zmm3 & ~zmm7 & mem) | (zmm3 & ~zmm7 & mem) | (zmm3 & zmm7 & ~mem) | (zmm3 & zmm7 & mem) ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] ; AVX512DQ-FCP-NEXT: vpandnq %zmm19, %zmm2, %zmm19 ; AVX512DQ-FCP-NEXT: vpandq %zmm2, %zmm21, %zmm21 @@ -8520,30 +8524,32 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm8, %ymm5 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm6 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm6 = (~zmm6 & ~zmm7 & mem) | (zmm6 & ~zmm7 & mem) | (zmm6 & zmm7 & ~mem) | (zmm6 & zmm7 & mem) ; AVX512DQ-FCP-NEXT: vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpord %zmm5, %zmm8, %zmm6 {%k1} ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm15 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm15 = (~zmm15 & ~zmm7 & mem) | (zmm15 & ~zmm7 & mem) | (zmm15 & zmm7 & ~mem) | (zmm15 & zmm7 & mem) ; AVX512DQ-FCP-NEXT: vpandnq (%rsp), %zmm2, %zmm5 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpord %zmm5, %zmm8, %zmm15 {%k1} -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm20, %zmm7, %zmm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & ~zmm7 & 
zmm20) | (zmm11 & ~zmm7 & zmm20) | (zmm11 & zmm7 & ~zmm20) | (zmm11 & zmm7 & zmm20) ; AVX512DQ-FCP-NEXT: vpandnq %zmm22, %zmm2, %zmm5 ; AVX512DQ-FCP-NEXT: vpandq %zmm2, %zmm23, %zmm8 ; AVX512DQ-FCP-NEXT: vpord %zmm5, %zmm8, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm24, %zmm7, %zmm14 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = (~zmm14 & ~zmm7 & zmm24) | (zmm14 & ~zmm7 & zmm24) | (zmm14 & zmm7 & ~zmm24) | (zmm14 & zmm7 & zmm24) ; AVX512DQ-FCP-NEXT: vpandnq %zmm25, %zmm2, %zmm5 ; AVX512DQ-FCP-NEXT: vpandq %zmm2, %zmm26, %zmm8 ; AVX512DQ-FCP-NEXT: vpord %zmm5, %zmm8, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm27, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & ~zmm7 & zmm27) | (zmm0 & ~zmm7 & zmm27) | (zmm0 & zmm7 & ~zmm27) | (zmm0 & zmm7 & zmm27) ; AVX512DQ-FCP-NEXT: vpandnq %zmm28, %zmm2, %zmm5 ; AVX512DQ-FCP-NEXT: vpandq %zmm2, %zmm29, %zmm8 ; AVX512DQ-FCP-NEXT: vpord %zmm5, %zmm8, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm30, %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm7 & zmm30) | (zmm1 & ~zmm7 & zmm30) | (zmm1 & zmm7 & ~zmm30) | (zmm1 & zmm7 & zmm30) ; AVX512DQ-FCP-NEXT: vpandnq %zmm31, %zmm2, %zmm5 ; AVX512DQ-FCP-NEXT: vpandq %zmm2, %zmm16, %zmm8 ; AVX512DQ-FCP-NEXT: vpord %zmm5, %zmm8, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm19, %zmm7, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm7 & zmm19) | (zmm4 & ~zmm7 & zmm19) | (zmm4 & zmm7 & ~zmm19) | (zmm4 & zmm7 & zmm19) ; AVX512DQ-FCP-NEXT: vpandnq %zmm13, %zmm2, %zmm5 ; AVX512DQ-FCP-NEXT: vpandq %zmm2, %zmm21, %zmm2 ; AVX512DQ-FCP-NEXT: vpord %zmm5, %zmm2, %zmm4 {%k1} From b8e79b3f5a23923dcf4a846571438d3857b3ad46 Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Mon, 30 Sep 2024 16:07:30 -0500 Subject: [PATCH 024/151] [NFC][AMDGPU] Pre-commit tests for buffer contents legalization (#110559) Currently, many attempts to lower loads and stores on buffer fat pointers lower 
directly to intrinsic calls that will be unsupported by or crash codegen (ex, storing a [2 x i32], a <6 x half>, or an i160). Record the current behavior to make the effects of the fix more visible in an upcoming PR. --- ...ffer-fat-pointers-contents-legalization.ll | 1683 +++++++++++++++++ 1 file changed, 1683 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll new file mode 100644 index 0000000000000..5b225636b120a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll @@ -0,0 +1,1683 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mcpu=gfx900 -passes=amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s + +target triple = "amdgcn--" + +;;; Legal types. These are natively supported, no casts should be performed. 
+ +define i8 @load_i8(ptr addrspace(8) %buf) { +; CHECK-LABEL: define i8 @load_i8( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[RET:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret i8 [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load i8, ptr addrspace(7) %p + ret i8 %ret +} + +define void @store_i8(i8 %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_i8( +; CHECK-SAME: i8 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[DATA]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store i8 %data, ptr addrspace(7) %p + ret void +} + +define i16 @load_i16(ptr addrspace(8) %buf) { +; CHECK-LABEL: define i16 @load_i16( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret i16 [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load i16, ptr addrspace(7) %p + ret i16 %ret +} + +define void @store_i16(i16 %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_i16( +; CHECK-SAME: i16 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store i16 %data, ptr addrspace(7) %p + ret void +} + +define i32 @load_i32(ptr addrspace(8) %buf) { +; CHECK-LABEL: define i32 @load_i32( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 
[[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret i32 [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load i32, ptr addrspace(7) %p + ret i32 %ret +} + +define void @store_i32(i32 %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_i32( +; CHECK-SAME: i32 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store i32 %data, ptr addrspace(7) %p + ret void +} + +define i64 @load_i64(ptr addrspace(8) %buf) { +; CHECK-LABEL: define i64 @load_i64( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call i64 @llvm.amdgcn.raw.ptr.buffer.load.i64(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret i64 [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load i64, ptr addrspace(7) %p + ret i64 %ret +} + +define void @store_i64(i64 %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_i64( +; CHECK-SAME: i64 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i64(i64 [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store i64 %data, ptr addrspace(7) %p + ret void +} + +define i128 @load_i128(ptr addrspace(8) %buf) { +; CHECK-LABEL: define i128 @load_i128( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call i128 @llvm.amdgcn.raw.ptr.buffer.load.i128(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret i128 [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load i128, ptr addrspace(7) %p + ret i128 %ret +} + +define void @store_i128(i128 %data, ptr 
addrspace(8) %buf) { +; CHECK-LABEL: define void @store_i128( +; CHECK-SAME: i128 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i128(i128 [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store i128 %data, ptr addrspace(7) %p + ret void +} + +define <1 x i32> @load_v1i32(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <1 x i32> @load_v1i32( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <1 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v1i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <1 x i32> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <1 x i32>, ptr addrspace(7) %p + ret <1 x i32> %ret +} + +define void @store_v1i32(<1 x i32> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v1i32( +; CHECK-SAME: <1 x i32> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v1i32(<1 x i32> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <1 x i32> %data, ptr addrspace(7) %p + ret void +} + +define <2 x i32> @load_v2i32(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <2 x i32> @load_v2i32( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <2 x i32> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <2 x i32>, ptr addrspace(7) %p + ret <2 x i32> %ret +} + +define void @store_v2i32(<2 x i32> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v2i32( +; CHECK-SAME: <2 x i32> [[DATA:%.*]], ptr 
addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <2 x i32> %data, ptr addrspace(7) %p + ret void +} + +define <3 x i32> @load_v3i32(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <3 x i32> @load_v3i32( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <3 x i32> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <3 x i32>, ptr addrspace(7) %p + ret <3 x i32> %ret +} + +define void @store_v3i32(<3 x i32> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v3i32( +; CHECK-SAME: <3 x i32> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <3 x i32> %data, ptr addrspace(7) %p + ret void +} + +define <4 x i32> @load_v4i32(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <4 x i32> @load_v4i32( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <4 x i32> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <4 x i32>, ptr addrspace(7) %p + ret <4 x i32> %ret +} + +define void @store_v4i32(<4 x i32> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v4i32( +; CHECK-SAME: <4 x i32> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void 
@llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <4 x i32> %data, ptr addrspace(7) %p + ret void +} + +define <2 x i16> @load_v2i16(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <2 x i16> @load_v2i16( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <2 x i16> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <2 x i16>, ptr addrspace(7) %p + ret <2 x i16> %ret +} + +define void @store_v2i16(<2 x i16> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v2i16( +; CHECK-SAME: <2 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <2 x i16> %data, ptr addrspace(7) %p + ret void +} + +define <4 x i16> @load_v4i16(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <4 x i16> @load_v4i16( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <4 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v4i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <4 x i16> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <4 x i16>, ptr addrspace(7) %p + ret <4 x i16> %ret +} + +define void @store_v4i16(<4 x i16> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v4i16( +; CHECK-SAME: <4 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i16(<4 x i16> [[DATA]], ptr addrspace(8) align 8 
[[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <4 x i16> %data, ptr addrspace(7) %p + ret void +} + +define <8 x i16> @load_v8i16(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <8 x i16> @load_v8i16( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <8 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v8i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <8 x i16> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <8 x i16>, ptr addrspace(7) %p + ret <8 x i16> %ret +} + +define void @store_v8i16(<8 x i16> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v8i16( +; CHECK-SAME: <8 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v8i16(<8 x i16> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <8 x i16> %data, ptr addrspace(7) %p + ret void +} + +define <2 x i64> @load_v2i64(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <2 x i64> @load_v2i64( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <2 x i64> @llvm.amdgcn.raw.ptr.buffer.load.v2i64(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <2 x i64> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <2 x i64>, ptr addrspace(7) %p + ret <2 x i64> %ret +} + +define void @store_v2i64(<2 x i64> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v2i64( +; CHECK-SAME: <2 x i64> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i64(<2 x i64> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr 
addrspace(8) %buf to ptr addrspace(7) + store <2 x i64> %data, ptr addrspace(7) %p + ret void +} + +define half @load_f16(ptr addrspace(8) %buf) { +; CHECK-LABEL: define half @load_f16( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call half @llvm.amdgcn.raw.ptr.buffer.load.f16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret half [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load half, ptr addrspace(7) %p + ret half %ret +} + +define void @store_f16(half %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_f16( +; CHECK-SAME: half [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f16(half [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store half %data, ptr addrspace(7) %p + ret void +} + +define bfloat @load_bf16(ptr addrspace(8) %buf) { +; CHECK-LABEL: define bfloat @load_bf16( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call bfloat @llvm.amdgcn.raw.ptr.buffer.load.bf16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret bfloat [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load bfloat, ptr addrspace(7) %p + ret bfloat %ret +} + +define void @store_bf16(bfloat %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_bf16( +; CHECK-SAME: bfloat [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.bf16(bfloat [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store bfloat %data, ptr addrspace(7) %p + ret void +} + +define <2 x half> @load_v2f16(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <2 x half> 
@load_v2f16( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.load.v2f16(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <2 x half> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <2 x half>, ptr addrspace(7) %p + ret <2 x half> %ret +} + +define void @store_v2f16(<2 x half> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v2f16( +; CHECK-SAME: <2 x half> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2f16(<2 x half> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <2 x half> %data, ptr addrspace(7) %p + ret void +} + +define <4 x bfloat> @load_v4bf16(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <4 x bfloat> @load_v4bf16( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <4 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v4bf16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <4 x bfloat> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <4 x bfloat>, ptr addrspace(7) %p + ret <4 x bfloat> %ret +} + +define void @store_v4bf16(<4 x bfloat> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v4bf16( +; CHECK-SAME: <4 x bfloat> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4bf16(<4 x bfloat> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <4 x bfloat> %data, ptr addrspace(7) %p + ret void +} + +define <8 x half> @load_v8f16(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <8 x half> @load_v8f16( +; CHECK-SAME: ptr addrspace(8) 
[[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <8 x half> @llvm.amdgcn.raw.ptr.buffer.load.v8f16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <8 x half> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <8 x half>, ptr addrspace(7) %p + ret <8 x half> %ret +} + +define void @store_v8f16(<8 x half> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v8f16( +; CHECK-SAME: <8 x half> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v8f16(<8 x half> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <8 x half> %data, ptr addrspace(7) %p + ret void +} + +define float @load_f32(ptr addrspace(8) %buf) { +; CHECK-LABEL: define float @load_f32( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret float [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load float, ptr addrspace(7) %p + ret float %ret +} + +define void @store_f32(float %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_f32( +; CHECK-SAME: float [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store float %data, ptr addrspace(7) %p + ret void +} + +define <2 x float> @load_v2f32(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <2 x float> @load_v2f32( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) 
align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <2 x float> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <2 x float>, ptr addrspace(7) %p + ret <2 x float> %ret +} + +define void @store_v2f32(<2 x float> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v2f32( +; CHECK-SAME: <2 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <2 x float> %data, ptr addrspace(7) %p + ret void +} + +define <3 x float> @load_v3f32(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <3 x float> @load_v3f32( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <3 x float> @llvm.amdgcn.raw.ptr.buffer.load.v3f32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <3 x float> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <3 x float>, ptr addrspace(7) %p + ret <3 x float> %ret +} + +define void @store_v3f32(<3 x float> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v3f32( +; CHECK-SAME: <3 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3f32(<3 x float> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <3 x float> %data, ptr addrspace(7) %p + ret void +} + +define <4 x float> @load_v4f32(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <4 x float> @load_v4f32( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; 
CHECK-NEXT: ret <4 x float> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <4 x float>, ptr addrspace(7) %p + ret <4 x float> %ret +} + +define void @store_v4f32(<4 x float> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v4f32( +; CHECK-SAME: <4 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <4 x float> %data, ptr addrspace(7) %p + ret void +} + +define ptr addrspace(0) @load_p0(ptr addrspace(8) %buf) { +; CHECK-LABEL: define ptr @load_p0( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call ptr @llvm.amdgcn.raw.ptr.buffer.load.p0(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret ptr [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load ptr addrspace(0), ptr addrspace(7) %p + ret ptr addrspace(0) %ret +} + +define void @store_p0(ptr addrspace(0) %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_p0( +; CHECK-SAME: ptr [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.p0(ptr [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store ptr addrspace(0) %data, ptr addrspace(7) %p + ret void +} + +define ptr addrspace(1) @load_p1(ptr addrspace(8) %buf) { +; CHECK-LABEL: define ptr addrspace(1) @load_p1( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call ptr addrspace(1) @llvm.amdgcn.raw.ptr.buffer.load.p1(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret ptr addrspace(1) [[RET]] +; + %p = addrspacecast ptr 
addrspace(8) %buf to ptr addrspace(7) + %ret = load ptr addrspace(1), ptr addrspace(7) %p + ret ptr addrspace(1) %ret +} + +define void @store_p1(ptr addrspace(1) %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_p1( +; CHECK-SAME: ptr addrspace(1) [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.p1(ptr addrspace(1) [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store ptr addrspace(1) %data, ptr addrspace(7) %p + ret void +} + +define ptr addrspace(2) @load_p2(ptr addrspace(8) %buf) { +; CHECK-LABEL: define ptr addrspace(2) @load_p2( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call ptr addrspace(2) @llvm.amdgcn.raw.ptr.buffer.load.p2(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret ptr addrspace(2) [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load ptr addrspace(2), ptr addrspace(7) %p + ret ptr addrspace(2) %ret +} + +define void @store_p2(ptr addrspace(2) %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_p2( +; CHECK-SAME: ptr addrspace(2) [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.p2(ptr addrspace(2) [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store ptr addrspace(2) %data, ptr addrspace(7) %p + ret void +} + +define ptr addrspace(3) @load_p3(ptr addrspace(8) %buf) { +; CHECK-LABEL: define ptr addrspace(3) @load_p3( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call ptr addrspace(3) @llvm.amdgcn.raw.ptr.buffer.load.p3(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret ptr addrspace(3) [[RET]] +; + %p = 
addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load ptr addrspace(3), ptr addrspace(7) %p + ret ptr addrspace(3) %ret +} + +define void @store_p3(ptr addrspace(3) %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_p3( +; CHECK-SAME: ptr addrspace(3) [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.p3(ptr addrspace(3) [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store ptr addrspace(3) %data, ptr addrspace(7) %p + ret void +} + +define ptr addrspace(4) @load_p4(ptr addrspace(8) %buf) { +; CHECK-LABEL: define ptr addrspace(4) @load_p4( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call ptr addrspace(4) @llvm.amdgcn.raw.ptr.buffer.load.p4(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret ptr addrspace(4) [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load ptr addrspace(4), ptr addrspace(7) %p + ret ptr addrspace(4) %ret +} + +define void @store_p4(ptr addrspace(4) %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_p4( +; CHECK-SAME: ptr addrspace(4) [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.p4(ptr addrspace(4) [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store ptr addrspace(4) %data, ptr addrspace(7) %p + ret void +} + +define ptr addrspace(5) @load_p5(ptr addrspace(8) %buf) { +; CHECK-LABEL: define ptr addrspace(5) @load_p5( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call ptr addrspace(5) @llvm.amdgcn.raw.ptr.buffer.load.p5(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret ptr addrspace(5) 
[[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load ptr addrspace(5), ptr addrspace(7) %p + ret ptr addrspace(5) %ret +} + +define void @store_p5(ptr addrspace(5) %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_p5( +; CHECK-SAME: ptr addrspace(5) [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.p5(ptr addrspace(5) [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store ptr addrspace(5) %data, ptr addrspace(7) %p + ret void +} + +define ptr addrspace(6) @load_p6(ptr addrspace(8) %buf) { +; CHECK-LABEL: define ptr addrspace(6) @load_p6( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call ptr addrspace(6) @llvm.amdgcn.raw.ptr.buffer.load.p6(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret ptr addrspace(6) [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load ptr addrspace(6), ptr addrspace(7) %p + ret ptr addrspace(6) %ret +} + +define void @store_p6(ptr addrspace(6) %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_p6( +; CHECK-SAME: ptr addrspace(6) [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.p6(ptr addrspace(6) [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store ptr addrspace(6) %data, ptr addrspace(7) %p + ret void +} + +define ptr addrspace(8) @load_p8(ptr addrspace(8) %buf) { +; CHECK-LABEL: define ptr addrspace(8) @load_p8( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call ptr addrspace(8) @llvm.amdgcn.raw.ptr.buffer.load.p8(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret ptr 
addrspace(8) [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load ptr addrspace(8), ptr addrspace(7) %p + ret ptr addrspace(8) %ret +} + +define void @store_p8(ptr addrspace(8) %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_p8( +; CHECK-SAME: ptr addrspace(8) [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.p8(ptr addrspace(8) [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store ptr addrspace(8) %data, ptr addrspace(7) %p + ret void +} + +define <2 x ptr addrspace(1)> @load_v2p1(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <2 x ptr addrspace(1)> @load_v2p1( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <2 x ptr addrspace(1)> @llvm.amdgcn.raw.ptr.buffer.load.v2p1(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <2 x ptr addrspace(1)> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <2 x ptr addrspace(1)>, ptr addrspace(7) %p + ret <2 x ptr addrspace(1)> %ret +} + +define void @store_v2p1(<2 x ptr addrspace(1)> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v2p1( +; CHECK-SAME: <2 x ptr addrspace(1)> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2p1(<2 x ptr addrspace(1)> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <2 x ptr addrspace(1)> %data, ptr addrspace(7) %p + ret void +} + +define <2 x ptr addrspace(5)> @load_v2p5(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <2 x ptr addrspace(5)> @load_v2p5( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <2 x ptr addrspace(5)> 
@llvm.amdgcn.raw.ptr.buffer.load.v2p5(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <2 x ptr addrspace(5)> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <2 x ptr addrspace(5)>, ptr addrspace(7) %p + ret <2 x ptr addrspace(5)> %ret +} + +define void @store_v2p5(<2 x ptr addrspace(5)> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v2p5( +; CHECK-SAME: <2 x ptr addrspace(5)> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2p5(<2 x ptr addrspace(5)> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <2 x ptr addrspace(5)> %data, ptr addrspace(7) %p + ret void +} + +define <3 x ptr addrspace(5)> @load_v3p5(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <3 x ptr addrspace(5)> @load_v3p5( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <3 x ptr addrspace(5)> @llvm.amdgcn.raw.ptr.buffer.load.v3p5(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <3 x ptr addrspace(5)> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <3 x ptr addrspace(5)>, ptr addrspace(7) %p + ret <3 x ptr addrspace(5)> %ret +} + +define void @store_v3p5(<3 x ptr addrspace(5)> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v3p5( +; CHECK-SAME: <3 x ptr addrspace(5)> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3p5(<3 x ptr addrspace(5)> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <3 x ptr addrspace(5)> %data, ptr addrspace(7) %p + ret void +} + +define <4 x ptr addrspace(5)> @load_v4p5(ptr addrspace(8) %buf) { +; 
CHECK-LABEL: define <4 x ptr addrspace(5)> @load_v4p5( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <4 x ptr addrspace(5)> @llvm.amdgcn.raw.ptr.buffer.load.v4p5(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <4 x ptr addrspace(5)> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <4 x ptr addrspace(5)>, ptr addrspace(7) %p + ret <4 x ptr addrspace(5)> %ret +} + +define void @store_v4p5(<4 x ptr addrspace(5)> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v4p5( +; CHECK-SAME: <4 x ptr addrspace(5)> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4p5(<4 x ptr addrspace(5)> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <4 x ptr addrspace(5)> %data, ptr addrspace(7) %p + ret void +} + +;;; 3 words in a short type. These need to be bitcast to <3 x i32> to be supported. 
+ +define <6 x half> @load_v6f16(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <6 x half> @load_v6f16( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <6 x half> @llvm.amdgcn.raw.ptr.buffer.load.v6f16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <6 x half> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <6 x half>, ptr addrspace(7) %p + ret <6 x half> %ret +} + +define void @store_v6f16(<6 x half> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v6f16( +; CHECK-SAME: <6 x half> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v6f16(<6 x half> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <6 x half> %data, ptr addrspace(7) %p + ret void +} + +;;; Long types (32 bit elements). Must be split into multiple operations. 
+ +define <5 x float> @load_v5f32(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <5 x float> @load_v5f32( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <5 x float> @llvm.amdgcn.raw.ptr.buffer.load.v5f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <5 x float> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <5 x float>, ptr addrspace(7) %p + ret <5 x float> %ret +} + +define void @store_v5f32(<5 x float> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v5f32( +; CHECK-SAME: <5 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v5f32(<5 x float> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <5 x float> %data, ptr addrspace(7) %p + ret void +} + +define <6 x float> @load_v6f32(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <6 x float> @load_v6f32( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <6 x float> @llvm.amdgcn.raw.ptr.buffer.load.v6f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <6 x float> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <6 x float>, ptr addrspace(7) %p + ret <6 x float> %ret +} + +define void @store_v6f32(<6 x float> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v6f32( +; CHECK-SAME: <6 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v6f32(<6 x float> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <6 x float> %data, ptr addrspace(7) %p + ret void +} + +define <7 x float> @load_v7f32(ptr 
addrspace(8) %buf) { +; CHECK-LABEL: define <7 x float> @load_v7f32( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <7 x float> @llvm.amdgcn.raw.ptr.buffer.load.v7f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <7 x float> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <7 x float>, ptr addrspace(7) %p + ret <7 x float> %ret +} + +define void @store_v7f32(<7 x float> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v7f32( +; CHECK-SAME: <7 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v7f32(<7 x float> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <7 x float> %data, ptr addrspace(7) %p + ret void +} + +define <8 x float> @load_v8f32(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <8 x float> @load_v8f32( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <8 x float> @llvm.amdgcn.raw.ptr.buffer.load.v8f32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <8 x float> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <8 x float>, ptr addrspace(7) %p + ret <8 x float> %ret +} + +define void @store_v8f32(<8 x float> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v8f32( +; CHECK-SAME: <8 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v8f32(<8 x float> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <8 x float> %data, ptr addrspace(7) %p + ret void +} + +define <10 x float> @load_v10f32(ptr addrspace(8) %buf) { +; CHECK-LABEL: define 
<10 x float> @load_v10f32( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <10 x float> @llvm.amdgcn.raw.ptr.buffer.load.v10f32(ptr addrspace(8) align 64 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <10 x float> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <10 x float>, ptr addrspace(7) %p + ret <10 x float> %ret +} + +define void @store_v10f32(<10 x float> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v10f32( +; CHECK-SAME: <10 x float> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v10f32(<10 x float> [[DATA]], ptr addrspace(8) align 64 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <10 x float> %data, ptr addrspace(7) %p + ret void +} + +define <6 x i32> @load_v6i32(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <6 x i32> @load_v6i32( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <6 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v6i32(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <6 x i32> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <6 x i32>, ptr addrspace(7) %p + ret <6 x i32> %ret +} + +define void @store_v6i32(<6 x i32> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v6i32( +; CHECK-SAME: <6 x i32> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v6i32(<6 x i32> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <6 x i32> %data, ptr addrspace(7) %p + ret void +} + +define <4 x ptr addrspace(1)> @load_v4p1(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <4 x ptr addrspace(1)> @load_v4p1( +; 
CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <4 x ptr addrspace(1)> @llvm.amdgcn.raw.ptr.buffer.load.v4p1(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <4 x ptr addrspace(1)> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <4 x ptr addrspace(1)>, ptr addrspace(7) %p + ret <4 x ptr addrspace(1)> %ret +} + +define void @store_v4p1(<4 x ptr addrspace(1)> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v4p1( +; CHECK-SAME: <4 x ptr addrspace(1)> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4p1(<4 x ptr addrspace(1)> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <4 x ptr addrspace(1)> %data, ptr addrspace(7) %p + ret void +} + +;;; Uneven types with 16-bit elements. Require splitting into multiple operations. 
+ +define <1 x i16> @load_v1i16(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <1 x i16> @load_v1i16( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <1 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v1i16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <1 x i16> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <1 x i16>, ptr addrspace(7) %p + ret <1 x i16> %ret +} + +define void @store_v1i16(<1 x i16> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v1i16( +; CHECK-SAME: <1 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v1i16(<1 x i16> [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <1 x i16> %data, ptr addrspace(7) %p + ret void +} + +define <3 x i16> @load_v3i16(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <3 x i16> @load_v3i16( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <3 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v3i16(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <3 x i16> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <3 x i16>, ptr addrspace(7) %p + ret <3 x i16> %ret +} + +define void @store_v3i16(<3 x i16> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v3i16( +; CHECK-SAME: <3 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3i16(<3 x i16> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <3 x i16> %data, ptr addrspace(7) %p + ret void +} + +define <5 x i16> @load_v5i16(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <5 x i16> 
@load_v5i16( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <5 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v5i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <5 x i16> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <5 x i16>, ptr addrspace(7) %p + ret <5 x i16> %ret +} + +define void @store_v5i16(<5 x i16> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v5i16( +; CHECK-SAME: <5 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v5i16(<5 x i16> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <5 x i16> %data, ptr addrspace(7) %p + ret void +} + +define <6 x i16> @load_v6i16(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <6 x i16> @load_v6i16( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <6 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v6i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <6 x i16> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <6 x i16>, ptr addrspace(7) %p + ret <6 x i16> %ret +} + +define void @store_v6i16(<6 x i16> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v6i16( +; CHECK-SAME: <6 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v6i16(<6 x i16> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <6 x i16> %data, ptr addrspace(7) %p + ret void +} + +define <7 x i16> @load_v7i16(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <7 x i16> @load_v7i16( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: 
[[RET:%.*]] = call <7 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v7i16(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <7 x i16> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <7 x i16>, ptr addrspace(7) %p + ret <7 x i16> %ret +} + +define void @store_v7i16(<7 x i16> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v7i16( +; CHECK-SAME: <7 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v7i16(<7 x i16> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <7 x i16> %data, ptr addrspace(7) %p + ret void +} + +define <9 x i16> @load_v9i16(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <9 x i16> @load_v9i16( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <9 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v9i16(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <9 x i16> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <9 x i16>, ptr addrspace(7) %p + ret <9 x i16> %ret +} + +define void @store_v9i16(<9 x i16> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v9i16( +; CHECK-SAME: <9 x i16> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v9i16(<9 x i16> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <9 x i16> %data, ptr addrspace(7) %p + ret void +} + +;;; Byte vectors. 
Need to be +;;; - Split into multiple operations +;;; - Bitcast if they have a natively supported width + +define <1 x i8> @load_v1i8(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <1 x i8> @load_v1i8( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <1 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v1i8(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <1 x i8> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <1 x i8>, ptr addrspace(7) %p + ret <1 x i8> %ret +} + +define void @store_v1i8(<1 x i8> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v1i8( +; CHECK-SAME: <1 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v1i8(<1 x i8> [[DATA]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <1 x i8> %data, ptr addrspace(7) %p + ret void +} + +define <2 x i8> @load_v2i8(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <2 x i8> @load_v2i8( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <2 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v2i8(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <2 x i8> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <2 x i8>, ptr addrspace(7) %p + ret <2 x i8> %ret +} + +define void @store_v2i8(<2 x i8> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v2i8( +; CHECK-SAME: <2 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i8(<2 x i8> [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <2 x i8> %data, ptr addrspace(7) %p + ret void +} + +define <3 x i8> 
@load_v3i8(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <3 x i8> @load_v3i8( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <3 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v3i8(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <3 x i8> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <3 x i8>, ptr addrspace(7) %p + ret <3 x i8> %ret +} + +define void @store_v3i8(<3 x i8> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v3i8( +; CHECK-SAME: <3 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v3i8(<3 x i8> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <3 x i8> %data, ptr addrspace(7) %p + ret void +} + +define <4 x i8> @load_v4i8(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <4 x i8> @load_v4i8( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <4 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v4i8(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <4 x i8> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <4 x i8>, ptr addrspace(7) %p + ret <4 x i8> %ret +} + +define void @store_v4i8(<4 x i8> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v4i8( +; CHECK-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i8(<4 x i8> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <4 x i8> %data, ptr addrspace(7) %p + ret void +} + +define <5 x i8> @load_v5i8(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <5 x i8> @load_v5i8( +; CHECK-SAME: ptr addrspace(8) 
[[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <5 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v5i8(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <5 x i8> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <5 x i8>, ptr addrspace(7) %p + ret <5 x i8> %ret +} + +define void @store_v5i8(<5 x i8> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v5i8( +; CHECK-SAME: <5 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v5i8(<5 x i8> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <5 x i8> %data, ptr addrspace(7) %p + ret void +} + +define <6 x i8> @load_v6i8(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <6 x i8> @load_v6i8( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <6 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v6i8(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <6 x i8> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <6 x i8>, ptr addrspace(7) %p + ret <6 x i8> %ret +} + +define void @store_v6i8(<6 x i8> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v6i8( +; CHECK-SAME: <6 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v6i8(<6 x i8> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <6 x i8> %data, ptr addrspace(7) %p + ret void +} + +define <7 x i8> @load_v7i8(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <7 x i8> @load_v7i8( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <7 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v7i8(ptr addrspace(8) 
align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <7 x i8> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <7 x i8>, ptr addrspace(7) %p + ret <7 x i8> %ret +} + +define void @store_v7i8(<7 x i8> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v7i8( +; CHECK-SAME: <7 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v7i8(<7 x i8> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <7 x i8> %data, ptr addrspace(7) %p + ret void +} + +define <8 x i8> @load_v8i8(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <8 x i8> @load_v8i8( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <8 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v8i8(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <8 x i8> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <8 x i8>, ptr addrspace(7) %p + ret <8 x i8> %ret +} + +define void @store_v8i8(<8 x i8> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v8i8( +; CHECK-SAME: <8 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v8i8(<8 x i8> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <8 x i8> %data, ptr addrspace(7) %p + ret void +} + +define <12 x i8> @load_v12i8(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <12 x i8> @load_v12i8( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <12 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v12i8(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <12 x i8> [[RET]] +; + %p = addrspacecast ptr addrspace(8) 
%buf to ptr addrspace(7) + %ret = load <12 x i8>, ptr addrspace(7) %p + ret <12 x i8> %ret +} + +define void @store_v12i8(<12 x i8> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v12i8( +; CHECK-SAME: <12 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v12i8(<12 x i8> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <12 x i8> %data, ptr addrspace(7) %p + ret void +} + +define <16 x i8> @load_v16i8(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <16 x i8> @load_v16i8( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <16 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v16i8(ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <16 x i8> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <16 x i8>, ptr addrspace(7) %p + ret <16 x i8> %ret +} + +define void @store_v16i8(<16 x i8> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v16i8( +; CHECK-SAME: <16 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v16i8(<16 x i8> [[DATA]], ptr addrspace(8) align 16 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <16 x i8> %data, ptr addrspace(7) %p + ret void +} + +define <32 x i8> @load_v32i8(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <32 x i8> @load_v32i8( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <32 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v32i8(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <32 x i8> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <32 x i8>, ptr addrspace(7) %p + ret <32 x i8> 
%ret +} + +define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v32i8( +; CHECK-SAME: <32 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v32i8(<32 x i8> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <32 x i8> %data, ptr addrspace(7) %p + ret void +} + +;;; Arrays. Need to become vectors. + +define [1 x i32] @load_a1i32(ptr addrspace(8) %buf) { +; CHECK-LABEL: define [1 x i32] @load_a1i32( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call [1 x i32] @llvm.amdgcn.raw.ptr.buffer.load.a1i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret [1 x i32] [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load [1 x i32], ptr addrspace(7) %p + ret [1 x i32] %ret +} + +define void @store_a1i32([1 x i32] %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_a1i32( +; CHECK-SAME: [1 x i32] [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.a1i32([1 x i32] [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store [1 x i32] %data, ptr addrspace(7) %p + ret void +} + +define [2 x i32] @load_a2i32(ptr addrspace(8) %buf) { +; CHECK-LABEL: define [2 x i32] @load_a2i32( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call [2 x i32] @llvm.amdgcn.raw.ptr.buffer.load.a2i32(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret [2 x i32] [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load [2 x i32], ptr addrspace(7) %p + ret [2 x i32] %ret +} + +define void @store_a2i32([2 x i32] 
%data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_a2i32( +; CHECK-SAME: [2 x i32] [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.a2i32([2 x i32] [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store [2 x i32] %data, ptr addrspace(7) %p + ret void +} + +define [2 x half] @load_a2f16(ptr addrspace(8) %buf) { +; CHECK-LABEL: define [2 x half] @load_a2f16( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call [2 x half] @llvm.amdgcn.raw.ptr.buffer.load.a2f16(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret [2 x half] [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load [2 x half], ptr addrspace(7) %p + ret [2 x half] %ret +} + +define void @store_a2f16([2 x half] %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_a2f16( +; CHECK-SAME: [2 x half] [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.a2f16([2 x half] [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store [2 x half] %data, ptr addrspace(7) %p + ret void +} + +define [2 x ptr addrspace(1)] @load_a2p1(ptr addrspace(8) %buf) { +; CHECK-LABEL: define [2 x ptr addrspace(1)] @load_a2p1( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call [2 x ptr addrspace(1)] @llvm.amdgcn.raw.ptr.buffer.load.a2p1(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret [2 x ptr addrspace(1)] [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load [2 x ptr addrspace(1)], ptr addrspace(7) %p + ret [2 x ptr addrspace(1)] %ret +} + +define void @store_a2p1([2 x ptr 
addrspace(1)] %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_a2p1( +; CHECK-SAME: [2 x ptr addrspace(1)] [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.a2p1([2 x ptr addrspace(1)] [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store [2 x ptr addrspace(1)] %data, ptr addrspace(7) %p + ret void +} + +;;; Scalars of atypical width. Need to be cast to vectors and split. + +define i40 @load_i40(ptr addrspace(8) %buf) { +; CHECK-LABEL: define i40 @load_i40( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call i40 @llvm.amdgcn.raw.ptr.buffer.load.i40(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret i40 [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load i40, ptr addrspace(7) %p + ret i40 %ret +} + +define void @store_i40(i40 %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_i40( +; CHECK-SAME: i40 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i40(i40 [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store i40 %data, ptr addrspace(7) %p + ret void +} + +define i96 @load_i96(ptr addrspace(8) %buf) { +; CHECK-LABEL: define i96 @load_i96( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call i96 @llvm.amdgcn.raw.ptr.buffer.load.i96(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret i96 [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load i96, ptr addrspace(7) %p + ret i96 %ret +} + +define void @store_i96(i96 %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_i96( +; CHECK-SAME: i96 
[[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i96(i96 [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store i96 %data, ptr addrspace(7) %p + ret void +} + +define i160 @load_i160(ptr addrspace(8) %buf) { +; CHECK-LABEL: define i160 @load_i160( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call i160 @llvm.amdgcn.raw.ptr.buffer.load.i160(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret i160 [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load i160, ptr addrspace(7) %p + ret i160 %ret +} + +define void @store_i160(i160 %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_i160( +; CHECK-SAME: i160 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i160(i160 [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store i160 %data, ptr addrspace(7) %p + ret void +} + +define i256 @load_i256(ptr addrspace(8) %buf) { +; CHECK-LABEL: define i256 @load_i256( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call i256 @llvm.amdgcn.raw.ptr.buffer.load.i256(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret i256 [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load i256, ptr addrspace(7) %p + ret i256 %ret +} + +define void @store_i256(i256 %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_i256( +; CHECK-SAME: i256 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i256(i256 [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: 
ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store i256 %data, ptr addrspace(7) %p + ret void +} + +;;; Non-byte-sized scalars. Require zero-extension. + +define i7 @load_i4(ptr addrspace(8) %buf) { +; CHECK-LABEL: define i7 @load_i4( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call i7 @llvm.amdgcn.raw.ptr.buffer.load.i7(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret i7 [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load i7, ptr addrspace(7) %p + ret i7 %ret +} + +define void @store_i4(i7 %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_i4( +; CHECK-SAME: i7 [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i7(i7 [[DATA]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store i7 %data, ptr addrspace(7) %p + ret void +} + +;;; Byte-sized vectors of i4. Require casts. 
+ +define <2 x i4> @load_v2i4(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <2 x i4> @load_v2i4( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <2 x i4> @llvm.amdgcn.raw.ptr.buffer.load.v2i4(ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <2 x i4> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <2 x i4>, ptr addrspace(7) %p + ret <2 x i4> %ret +} + +define void @store_v2i4(<2 x i4> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v2i4( +; CHECK-SAME: <2 x i4> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i4(<2 x i4> [[DATA]], ptr addrspace(8) align 1 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <2 x i4> %data, ptr addrspace(7) %p + ret void +} + +define <4 x i4> @load_v4i4(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <4 x i4> @load_v4i4( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <4 x i4> @llvm.amdgcn.raw.ptr.buffer.load.v4i4(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <4 x i4> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <4 x i4>, ptr addrspace(7) %p + ret <4 x i4> %ret +} + +define void @store_v4i4(<4 x i4> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v4i4( +; CHECK-SAME: <4 x i4> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i4(<4 x i4> [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <4 x i4> %data, ptr addrspace(7) %p + ret void +} + +define <8 x i4> @load_v8i4(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <8 x i4> @load_v8i4( +; CHECK-SAME: ptr 
addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <8 x i4> @llvm.amdgcn.raw.ptr.buffer.load.v8i4(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <8 x i4> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <8 x i4>, ptr addrspace(7) %p + ret <8 x i4> %ret +} + +define void @store_v8i4(<8 x i4> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v8i4( +; CHECK-SAME: <8 x i4> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v8i4(<8 x i4> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <8 x i4> %data, ptr addrspace(7) %p + ret void +} + +;;; Vectors of non-byte-sized integers. + +define <2 x i6> @load_v2i6(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <2 x i6> @load_v2i6( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <2 x i6> @llvm.amdgcn.raw.ptr.buffer.load.v2i6(ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret <2 x i6> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <2 x i6>, ptr addrspace(7) %p + ret <2 x i6> %ret +} + +define void @store_v2i6(<2 x i6> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v2i6( +; CHECK-SAME: <2 x i6> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i6(<2 x i6> [[DATA]], ptr addrspace(8) align 2 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <2 x i6> %data, ptr addrspace(7) %p + ret void +} + +;; Blocks of fp6 elements +define <6 x i32> @load_v32i6(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <6 x i32> @load_v32i6( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; 
CHECK-NEXT: [[RET:%.*]] = call <32 x i6> @llvm.amdgcn.raw.ptr.buffer.load.v32i6(ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: [[RET_CAST:%.*]] = bitcast <32 x i6> [[RET]] to <6 x i32> +; CHECK-NEXT: ret <6 x i32> [[RET_CAST]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load <32 x i6>, ptr addrspace(7) %p + %ret.cast = bitcast <32 x i6> %ret to <6 x i32> + ret <6 x i32> %ret.cast +} + +define void @store_v32i6(<6 x i32> %data.abi, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @store_v32i6( +; CHECK-SAME: <6 x i32> [[DATA_ABI:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[DATA:%.*]] = bitcast <6 x i32> [[DATA_ABI]] to <32 x i6> +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v32i6(<32 x i6> [[DATA]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; + %data = bitcast <6 x i32> %data.abi to <32 x i6> + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store <32 x i6> %data, ptr addrspace(7) %p + ret void +} + +;;; Modifiers + +define <4 x i8> @volatile_load_v4i8(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <4 x i8> @volatile_load_v4i8( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <4 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v4i8(ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 -2147483648) +; CHECK-NEXT: ret <4 x i8> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load volatile <4 x i8>, ptr addrspace(7) %p + ret <4 x i8> %ret +} + +define void @volatile_store_v4i8(<4 x i8> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @volatile_store_v4i8( +; CHECK-SAME: <4 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i8(<4 x i8> [[DATA]], ptr addrspace(8) align 4 [[BUF]], i32 0, i32 0, i32 -2147483648) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf 
to ptr addrspace(7) + store volatile <4 x i8> %data, ptr addrspace(7) %p + ret void +} + +define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) %buf) { +; CHECK-LABEL: define <6 x i8> @volatile_load_v6i8( +; CHECK-SAME: ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call <6 x i8> @llvm.amdgcn.raw.ptr.buffer.load.v6i8(ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 -2147483648) +; CHECK-NEXT: ret <6 x i8> [[RET]] +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %ret = load volatile <6 x i8>, ptr addrspace(7) %p + ret <6 x i8> %ret +} + +define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @volatile_store_v6i8( +; CHECK-SAME: <6 x i8> [[DATA:%.*]], ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v6i8(<6 x i8> [[DATA]], ptr addrspace(8) align 8 [[BUF]], i32 0, i32 0, i32 -2147483648) +; CHECK-NEXT: ret void +; + %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + store volatile <6 x i8> %data, ptr addrspace(7) %p + ret void +} From 0547e573c555445e37db5c3bc92ee72274e19b69 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Mon, 30 Sep 2024 17:08:44 -0400 Subject: [PATCH 025/151] [runtimes] Run backdeployment CI on Github hosted runners (#109984) This removes the need for macOS nodes in Buildkite. It also moves to the proper way of testing backdeployment, which is to actually run on the target OS itself, instead of using packaged dylibs from previous OS versions and trying to emulate backdeployment with DYLD_LIBRARY_PATH. As a drive-by change, also fix a few back-deployment annotations that were incorrect and add support for minor versions in the Lit feature determining availability from the target triple. 
--- .github/workflows/libcxx-build-and-test.yaml | 23 +-- .../apple-libc++-backdeployment.cfg.in | 65 --------- .../test/configs/apple-libc++-system.cfg.in | 33 +++++ .../filebuf.members/open_pointer.pass.cpp | 3 + .../fstreams/filebuf.virtuals/setbuf.pass.cpp | 3 + .../fstreams/fstream.cons/pointer.pass.cpp | 3 + .../fstream.members/open_pointer.pass.cpp | 3 + .../fstreams/ofstream.cons/pointer.pass.cpp | 3 + .../ofstream.members/open_pointer.pass.cpp | 3 + .../stringstream.members/gcount.pass.cpp | 3 + libcxx/utils/ci/buildkite-pipeline.yml | 42 ------ libcxx/utils/ci/run-buildbot | 132 ++++++++++-------- libcxx/utils/libcxx/test/features.py | 34 ++--- .../apple-libc++abi-backdeployment.cfg.in | 66 --------- .../configs/apple-libc++abi-system.cfg.in | 34 +++++ libcxxabi/test/test_demangle.pass.cpp | 5 +- .../apple-libunwind-backdeployment.cfg.in | 67 --------- .../configs/apple-libunwind-system.cfg.in | 41 ++++++ 18 files changed, 236 insertions(+), 327 deletions(-) delete mode 100644 libcxx/test/configs/apple-libc++-backdeployment.cfg.in create mode 100644 libcxx/test/configs/apple-libc++-system.cfg.in delete mode 100644 libcxxabi/test/configs/apple-libc++abi-backdeployment.cfg.in create mode 100644 libcxxabi/test/configs/apple-libc++abi-system.cfg.in delete mode 100644 libunwind/test/configs/apple-libunwind-backdeployment.cfg.in create mode 100644 libunwind/test/configs/apple-libunwind-system.cfg.in diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml index b5e60781e0006..184fed2268e81 100644 --- a/.github/workflows/libcxx-build-and-test.yaml +++ b/.github/workflows/libcxx-build-and-test.yaml @@ -193,17 +193,24 @@ jobs: **/crash_diagnostics/* macos: - runs-on: macos-14 needs: [ stage1 ] strategy: - fail-fast: true + fail-fast: false matrix: - config: [ - generic-cxx03, - generic-cxx23, - generic-modules, - apple-configuration - ] + include: + - config: generic-cxx03 + os: macos-latest + - config: generic-cxx23 
+ os: macos-latest + - config: generic-modules + os: macos-latest + - config: apple-configuration + os: macos-latest + - config: apple-system + os: macos-13 + - config: apple-system-hardened + os: macos-13 + runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 - uses: maxim-lobanov/setup-xcode@v1 diff --git a/libcxx/test/configs/apple-libc++-backdeployment.cfg.in b/libcxx/test/configs/apple-libc++-backdeployment.cfg.in deleted file mode 100644 index 9843c4a9ad70d..0000000000000 --- a/libcxx/test/configs/apple-libc++-backdeployment.cfg.in +++ /dev/null @@ -1,65 +0,0 @@ -# Testing configuration for back-deployment against older Apple system libc++. -# -# Under this configuration, we compile and link all the test suite against the latest libc++, -# however we run against the libc++ on a different platform. This emulates the workflow of -# a developer building their application using recent tools but with the goal of deploying -# on existing devices running an older OS (and hence an older dylib). - -import os, site -site.addsitedir(os.path.join('@LIBCXX_SOURCE_DIR@', 'utils')) -import libcxx.test.params, libcxx.test.config, libcxx.test.dsl - -lit_config.load_config(config, '@CMAKE_CURRENT_BINARY_DIR@/cmake-bridge.cfg') - -BACKDEPLOYMENT_PARAMETERS = [ - libcxx.test.dsl.Parameter(name='cxx_runtime_root', type=str, - actions=lambda root: [libcxx.test.dsl.AddSubstitution('%{cxx-runtime-root}', root)], - help=""" - The simulated root of the system (for libc++) when running tests. - - This should be a directory hierarchy under which the libc++ dylib can be found. - The dylib in that hierarchy is the one that will be used at runtime when running - the tests. - """), - libcxx.test.dsl.Parameter(name='abi_runtime_root', type=str, - actions=lambda root: [libcxx.test.dsl.AddSubstitution('%{abi-runtime-root}', root)], - help=""" - The simulated root of the system (for libc++abi) when running tests. 
- - This should be a directory hierarchy under which the libc++abi dylib can be found. - The dylib in that hierarchy is the one that will be used at runtime when running - the tests. - """), - libcxx.test.dsl.Parameter(name='unwind_runtime_root', type=str, - actions=lambda root: [libcxx.test.dsl.AddSubstitution('%{unwind-runtime-root}', root)], - help=""" - The simulated root of the system (for libunwind) when running tests. - - This should be a directory hierarchy under which the libunwind dylib can be found. - The dylib in that hierarchy is the one that will be used at runtime when running - the tests. - """), -] - -config.substitutions.append(('%{flags}', - '-isysroot {}'.format('@CMAKE_OSX_SYSROOT@') if '@CMAKE_OSX_SYSROOT@' else '' -)) -config.substitutions.append(('%{compile_flags}', - '-nostdinc++ -I %{include-dir} -I %{libcxx-dir}/test/support' -)) -config.substitutions.append(('%{link_flags}', - '-nostdlib++ -L %{lib-dir} -lc++' -)) -config.substitutions.append(('%{exec}', - '%{executor} --execdir %T --env DYLD_LIBRARY_PATH="%{cxx-runtime-root}:%{abi-runtime-root}:%{unwind-runtime-root}" -- ' -)) - -config.stdlib = 'apple-libc++' -config.using_system_stdlib = True - -libcxx.test.config.configure( - libcxx.test.params.DEFAULT_PARAMETERS + BACKDEPLOYMENT_PARAMETERS, - libcxx.test.features.DEFAULT_FEATURES, - config, - lit_config -) diff --git a/libcxx/test/configs/apple-libc++-system.cfg.in b/libcxx/test/configs/apple-libc++-system.cfg.in new file mode 100644 index 0000000000000..b59506f375c4a --- /dev/null +++ b/libcxx/test/configs/apple-libc++-system.cfg.in @@ -0,0 +1,33 @@ +# Testing configuration for back-deployment against the system-provided libc++. +# +# Under this configuration, we compile and link all the test suite against the just-built +# libc++, but we run against the system libc++. 
+ +import os, site +site.addsitedir(os.path.join('@LIBCXX_SOURCE_DIR@', 'utils')) +import libcxx.test.params, libcxx.test.config, libcxx.test.dsl + +lit_config.load_config(config, '@CMAKE_CURRENT_BINARY_DIR@/cmake-bridge.cfg') + +config.substitutions.append(('%{flags}', + '-isysroot {}'.format('@CMAKE_OSX_SYSROOT@') if '@CMAKE_OSX_SYSROOT@' else '' +)) +config.substitutions.append(('%{compile_flags}', + '-nostdinc++ -I %{include-dir} -I %{libcxx-dir}/test/support' +)) +config.substitutions.append(('%{link_flags}', + '-nostdlib++ -L %{lib-dir} -lc++' +)) +config.substitutions.append(('%{exec}', + '%{executor} --execdir %T -- ' +)) + +config.stdlib = 'apple-libc++' +config.using_system_stdlib = True + +libcxx.test.config.configure( + libcxx.test.params.DEFAULT_PARAMETERS, + libcxx.test.features.DEFAULT_FEATURES, + config, + lit_config +) diff --git a/libcxx/test/std/input.output/file.streams/fstreams/filebuf.members/open_pointer.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.members/open_pointer.pass.cpp index f070762b3b94d..9f617dc1e5a89 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/filebuf.members/open_pointer.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.members/open_pointer.pass.cpp @@ -10,6 +10,9 @@ // basic_filebuf* open(const char* s, ios_base::openmode mode); +// In C++23 and later, this test requires support for P2467R1 in the dylib (a3f17ba3febbd546f2342ffc780ac93b694fdc8d) +// XFAIL: (!c++03 && !c++11 && !c++14 && !c++17 && !c++20) && using-built-library-before-llvm-18 + // XFAIL: LIBCXX-AIX-FIXME #include diff --git a/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/setbuf.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/setbuf.pass.cpp index 8bcce28162033..10435dc482367 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/setbuf.pass.cpp +++ 
b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/setbuf.pass.cpp @@ -10,6 +10,9 @@ // basic_streambuf* setbuf(char_type* s, streamsize n) override; +// In C++23 and later, this test requires support for P2467R1 in the dylib (a3f17ba3febbd546f2342ffc780ac93b694fdc8d) +// XFAIL: (!c++03 && !c++11 && !c++14 && !c++17 && !c++20) && using-built-library-before-llvm-18 + #include #include #include diff --git a/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/pointer.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/pointer.pass.cpp index df7d3b948e327..2e0ebcd684d79 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/pointer.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/pointer.pass.cpp @@ -13,6 +13,9 @@ // explicit basic_fstream(const char* s, ios_base::openmode mode = ios_base::in | ios_base::out); +// In C++23 and later, this test requires support for P2467R1 in the dylib (a3f17ba3febbd546f2342ffc780ac93b694fdc8d) +// XFAIL: (!c++03 && !c++11 && !c++14 && !c++17 && !c++20) && using-built-library-before-llvm-18 + // XFAIL: LIBCXX-AIX-FIXME #include diff --git a/libcxx/test/std/input.output/file.streams/fstreams/fstream.members/open_pointer.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/fstream.members/open_pointer.pass.cpp index 790b9ef02f902..0d83d681b1dfc 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/fstream.members/open_pointer.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/fstream.members/open_pointer.pass.cpp @@ -13,6 +13,9 @@ // void open(const char* s, ios_base::openmode mode = ios_base::in|ios_base::out); +// In C++23 and later, this test requires support for P2467R1 in the dylib (a3f17ba3febbd546f2342ffc780ac93b694fdc8d) +// XFAIL: (!c++03 && !c++11 && !c++14 && !c++17 && !c++20) && using-built-library-before-llvm-18 + // XFAIL: LIBCXX-AIX-FIXME #include diff --git 
a/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/pointer.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/pointer.pass.cpp index af43ffdbf8006..fbb03f1e85841 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/pointer.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/pointer.pass.cpp @@ -13,6 +13,9 @@ // explicit basic_ofstream(const char* s, ios_base::openmode mode = ios_base::out); +// In C++23 and later, this test requires support for P2467R1 in the dylib (a3f17ba3febbd546f2342ffc780ac93b694fdc8d) +// XFAIL: (!c++03 && !c++11 && !c++14 && !c++17 && !c++20) && using-built-library-before-llvm-18 + // XFAIL: LIBCXX-AIX-FIXME #include diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ofstream.members/open_pointer.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ofstream.members/open_pointer.pass.cpp index b0a68fd4340b7..73a474277a933 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/ofstream.members/open_pointer.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/ofstream.members/open_pointer.pass.cpp @@ -13,6 +13,9 @@ // void open(const char* s, ios_base::openmode mode = ios_base::out); +// In C++23 and later, this test requires support for P2467R1 in the dylib (a3f17ba3febbd546f2342ffc780ac93b694fdc8d) +// XFAIL: (!c++03 && !c++11 && !c++14 && !c++17 && !c++20) && using-built-library-before-llvm-18 + // XFAIL: LIBCXX-AIX-FIXME #include diff --git a/libcxx/test/std/input.output/string.streams/stringstream/stringstream.members/gcount.pass.cpp b/libcxx/test/std/input.output/string.streams/stringstream/stringstream.members/gcount.pass.cpp index a9079dc63b6b5..84004dc12f11a 100644 --- a/libcxx/test/std/input.output/string.streams/stringstream/stringstream.members/gcount.pass.cpp +++ b/libcxx/test/std/input.output/string.streams/stringstream/stringstream.members/gcount.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: 
32-bit-pointer // REQUIRES: large_tests +// This bug was fixed in the dylib by 53aed4759b33e33614e0f4e321bc1ef764b6d5b6. +// XFAIL: using-built-library-before-llvm-17 + // Android devices frequently don't have enough memory to run this test. Rather // than throw std::bad_alloc, exhausting memory triggers the OOM Killer. // UNSUPPORTED: LIBCXX-ANDROID-FIXME diff --git a/libcxx/utils/ci/buildkite-pipeline.yml b/libcxx/utils/ci/buildkite-pipeline.yml index 906df734bc42b..d1465721cf164 100644 --- a/libcxx/utils/ci/buildkite-pipeline.yml +++ b/libcxx/utils/ci/buildkite-pipeline.yml @@ -37,48 +37,6 @@ definitions: - "**/crash_diagnostics/*" steps: -- group: ':mac: Apple' - steps: - - label: Apple back-deployment macosx10.13 - command: libcxx/utils/ci/run-buildbot apple-system-backdeployment-10.13 - agents: - queue: libcxx-builders - os: macos - arch: x86_64 # We need to use x86_64 for back-deployment CI on this target since macOS didn't support arm64 back then - <<: *common - - - label: Apple back-deployment macosx10.15 - command: libcxx/utils/ci/run-buildbot apple-system-backdeployment-10.15 - agents: - queue: libcxx-builders - os: macos - arch: x86_64 # We need to use x86_64 for back-deployment CI on this target since macOS didn't support arm64 back then - <<: *common - - - label: Apple back-deployment with hardening enabled - command: libcxx/utils/ci/run-buildbot apple-system-backdeployment-hardened-11.0 - agents: - queue: libcxx-builders - os: macos - arch: x86_64 # TODO: Remove this once we are able to run back-deployment on arm64 again, since this isn't x86_64 specific - <<: *common - - # TODO: Re-enable this once we've figured out how to run back-deployment testing on arm64 on recent OSes - # - label: "Apple back-deployment macosx11.0 arm64" - # command: "libcxx/utils/ci/run-buildbot apple-system-backdeployment-11.0" - # artifact_paths: - # - "**/test-results.xml" - # - "**/*.abilist" - # agents: - # queue: "libcxx-builders" - # os: "macos" - # arch: "arm64" - # 
retry: - # automatic: - # - exit_status: -1 # Agent was lost - # limit: 2 - # timeout_in_minutes: 120 - - group: ARM steps: - label: AArch64 diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot index b0533cb9a49c9..a82d121838703 100755 --- a/libcxx/utils/ci/run-buildbot +++ b/libcxx/utils/ci/run-buildbot @@ -522,80 +522,94 @@ apple-configuration) # TODO: It would be better to run the tests against the fake-installed version of libc++ instead xcrun --sdk macosx ninja -vC "${BUILD_DIR}/${arch}" check-cxx check-cxxabi check-cxx-abilist ;; -apple-system-backdeployment-hardened-*) +apple-system-hardened) clean - if [[ "${OSX_ROOTS}" == "" ]]; then - echo "--- Downloading previous macOS dylibs" - PREVIOUS_DYLIBS_URL="https://dl.dropboxusercontent.com/s/gmcfxwgl9f9n6pu/libcxx-roots.tar.gz" - OSX_ROOTS="${BUILD_DIR}/macos-roots" - mkdir -p "${OSX_ROOTS}" - curl "${PREVIOUS_DYLIBS_URL}" | tar -xz --strip-components=1 -C "${OSX_ROOTS}" - fi + arch="$(uname -m)" + version="$(sw_vers --productVersion)" + params="target_triple=${arch}-apple-macosx${version}" + params+=";hardening_mode=fast" - DEPLOYMENT_TARGET="${BUILDER#apple-system-backdeployment-hardened-}" + # In the Apple system configuration, we build libc++ and libunwind separately. 
+ ${CMAKE} \ + -S "${MONOREPO_ROOT}/runtimes" \ + -B "${BUILD_DIR}/cxx" \ + -GNinja -DCMAKE_MAKE_PROGRAM="${NINJA}" \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}/cxx" \ + -DLLVM_LIT_ARGS="-sv --xunit-xml-output test-results.xml --timeout=1500 --time-tests" \ + -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi" \ + -DLIBCXX_CXX_ABI=libcxxabi \ + -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Apple.cmake" \ + -DLIBCXX_TEST_CONFIG="apple-libc++-system.cfg.in" \ + -DLIBCXXABI_TEST_CONFIG="apple-libc++abi-system.cfg.in" \ + -DLIBCXX_TEST_PARAMS="${params}" \ + -DLIBCXXABI_TEST_PARAMS="${params}" - # TODO: On Apple platforms, we never produce libc++abi.1.dylib or libunwind.1.dylib, - # only libc++abi.dylib and libunwind.dylib. Fix that in the build so that the - # tests stop searching for @rpath/libc++abi.1.dylib and @rpath/libunwind.1.dylib. - cp "${OSX_ROOTS}/macOS/libc++abi/${DEPLOYMENT_TARGET}/libc++abi.dylib" \ - "${OSX_ROOTS}/macOS/libc++abi/${DEPLOYMENT_TARGET}/libc++abi.1.dylib" - cp "${OSX_ROOTS}/macOS/libunwind/${DEPLOYMENT_TARGET}/libunwind.dylib" \ - "${OSX_ROOTS}/macOS/libunwind/${DEPLOYMENT_TARGET}/libunwind.1.dylib" + ${CMAKE} \ + -S "${MONOREPO_ROOT}/runtimes" \ + -B "${BUILD_DIR}/unwind" \ + -GNinja -DCMAKE_MAKE_PROGRAM="${NINJA}" \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}/unwind" \ + -DLLVM_LIT_ARGS="-sv --xunit-xml-output test-results.xml --timeout=1500 --time-tests" \ + -DLLVM_ENABLE_RUNTIMES="libunwind" \ + -DLIBUNWIND_TEST_CONFIG="apple-libunwind-system.cfg.in" \ + -DLIBUNWIND_TEST_PARAMS="${params}" \ + -DCMAKE_INSTALL_NAME_DIR="/usr/lib/system" - arch="$(uname -m)" - PARAMS="target_triple=${arch}-apple-macosx${DEPLOYMENT_TARGET}" - PARAMS+=";cxx_runtime_root=${OSX_ROOTS}/macOS/libc++/${DEPLOYMENT_TARGET}" - PARAMS+=";abi_runtime_root=${OSX_ROOTS}/macOS/libc++abi/${DEPLOYMENT_TARGET}" - PARAMS+=";unwind_runtime_root=${OSX_ROOTS}/macOS/libunwind/${DEPLOYMENT_TARGET}" - 
PARAMS+=";hardening_mode=fast" + echo "+++ Running the libc++ tests" + ${NINJA} -vC "${BUILD_DIR}/cxx" check-cxx - generate-cmake -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Apple.cmake" \ - -DLIBCXX_TEST_CONFIG="apple-libc++-backdeployment.cfg.in" \ - -DLIBCXXABI_TEST_CONFIG="apple-libc++abi-backdeployment.cfg.in" \ - -DLIBUNWIND_TEST_CONFIG="apple-libunwind-backdeployment.cfg.in" \ - -DLIBCXX_TEST_PARAMS="${PARAMS}" \ - -DLIBCXXABI_TEST_PARAMS="${PARAMS}" \ - -DLIBUNWIND_TEST_PARAMS="${PARAMS}" + echo "+++ Running the libc++abi tests" + ${NINJA} -vC "${BUILD_DIR}/cxx" check-cxxabi - check-runtimes + echo "+++ Running the libunwind tests" + ${NINJA} -vC "${BUILD_DIR}/unwind" check-unwind ;; -apple-system-backdeployment-*) +apple-system) clean - if [[ "${OSX_ROOTS}" == "" ]]; then - echo "--- Downloading previous macOS dylibs" - PREVIOUS_DYLIBS_URL="https://dl.dropboxusercontent.com/s/gmcfxwgl9f9n6pu/libcxx-roots.tar.gz" - OSX_ROOTS="${BUILD_DIR}/macos-roots" - mkdir -p "${OSX_ROOTS}" - curl "${PREVIOUS_DYLIBS_URL}" | tar -xz --strip-components=1 -C "${OSX_ROOTS}" - fi + arch="$(uname -m)" + version="$(sw_vers --productVersion)" + params="target_triple=${arch}-apple-macosx${version}" - DEPLOYMENT_TARGET="${BUILDER#apple-system-backdeployment-}" + # In the Apple system configuration, we build libc++ and libunwind separately. 
+ ${CMAKE} \ + -S "${MONOREPO_ROOT}/runtimes" \ + -B "${BUILD_DIR}/cxx" \ + -GNinja -DCMAKE_MAKE_PROGRAM="${NINJA}" \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}/cxx" \ + -DLLVM_LIT_ARGS="-sv --xunit-xml-output test-results.xml --timeout=1500 --time-tests" \ + -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi" \ + -DLIBCXX_CXX_ABI=libcxxabi \ + -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Apple.cmake" \ + -DLIBCXX_TEST_CONFIG="apple-libc++-system.cfg.in" \ + -DLIBCXXABI_TEST_CONFIG="apple-libc++abi-system.cfg.in" \ + -DLIBCXX_TEST_PARAMS="${params}" \ + -DLIBCXXABI_TEST_PARAMS="${params}" - # TODO: On Apple platforms, we never produce libc++abi.1.dylib or libunwind.1.dylib, - # only libc++abi.dylib and libunwind.dylib. Fix that in the build so that the - # tests stop searching for @rpath/libc++abi.1.dylib and @rpath/libunwind.1.dylib. - cp "${OSX_ROOTS}/macOS/libc++abi/${DEPLOYMENT_TARGET}/libc++abi.dylib" \ - "${OSX_ROOTS}/macOS/libc++abi/${DEPLOYMENT_TARGET}/libc++abi.1.dylib" - cp "${OSX_ROOTS}/macOS/libunwind/${DEPLOYMENT_TARGET}/libunwind.dylib" \ - "${OSX_ROOTS}/macOS/libunwind/${DEPLOYMENT_TARGET}/libunwind.1.dylib" + ${CMAKE} \ + -S "${MONOREPO_ROOT}/runtimes" \ + -B "${BUILD_DIR}/unwind" \ + -GNinja -DCMAKE_MAKE_PROGRAM="${NINJA}" \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}/unwind" \ + -DLLVM_LIT_ARGS="-sv --xunit-xml-output test-results.xml --timeout=1500 --time-tests" \ + -DLLVM_ENABLE_RUNTIMES="libunwind" \ + -DLIBUNWIND_TEST_CONFIG="apple-libunwind-system.cfg.in" \ + -DLIBUNWIND_TEST_PARAMS="${params}" \ + -DCMAKE_INSTALL_NAME_DIR="/usr/lib/system" - arch="$(uname -m)" - PARAMS="target_triple=${arch}-apple-macosx${DEPLOYMENT_TARGET}" - PARAMS+=";cxx_runtime_root=${OSX_ROOTS}/macOS/libc++/${DEPLOYMENT_TARGET}" - PARAMS+=";abi_runtime_root=${OSX_ROOTS}/macOS/libc++abi/${DEPLOYMENT_TARGET}" - PARAMS+=";unwind_runtime_root=${OSX_ROOTS}/macOS/libunwind/${DEPLOYMENT_TARGET}" + echo "+++ Running the 
libc++ tests" + ${NINJA} -vC "${BUILD_DIR}/cxx" check-cxx - generate-cmake -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Apple.cmake" \ - -DLIBCXX_TEST_CONFIG="apple-libc++-backdeployment.cfg.in" \ - -DLIBCXXABI_TEST_CONFIG="apple-libc++abi-backdeployment.cfg.in" \ - -DLIBUNWIND_TEST_CONFIG="apple-libunwind-backdeployment.cfg.in" \ - -DLIBCXX_TEST_PARAMS="${PARAMS}" \ - -DLIBCXXABI_TEST_PARAMS="${PARAMS}" \ - -DLIBUNWIND_TEST_PARAMS="${PARAMS}" + echo "+++ Running the libc++abi tests" + ${NINJA} -vC "${BUILD_DIR}/cxx" check-cxxabi - check-runtimes + echo "+++ Running the libunwind tests" + ${NINJA} -vC "${BUILD_DIR}/unwind" check-unwind ;; benchmarks) clean diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py index 29810c8ffee53..735eb5ac949dc 100644 --- a/libcxx/utils/libcxx/test/features.py +++ b/libcxx/utils/libcxx/test/features.py @@ -620,21 +620,21 @@ def check_gdb(cfg): Feature( name="_target-has-llvm-17", when=lambda cfg: BooleanExpression.evaluate( - "_target-has-llvm-18 || target={{.+}}-apple-macosx{{14.[4-9](.0)?}} || target={{.+}}-apple-macosx{{1[5-9]([.].+)?}}", + "_target-has-llvm-18 || target={{.+}}-apple-macosx{{14.[4-9](.[0-9]+)?}} || target={{.+}}-apple-macosx{{1[5-9]([.].+)?}}", cfg.available_features, ), ), Feature( name="_target-has-llvm-16", when=lambda cfg: BooleanExpression.evaluate( - "_target-has-llvm-17 || target={{.+}}-apple-macosx{{14.[0-3](.0)?}}", + "_target-has-llvm-17 || target={{.+}}-apple-macosx{{14.[0-3](.[0-9]+)?}}", cfg.available_features, ), ), Feature( name="_target-has-llvm-15", when=lambda cfg: BooleanExpression.evaluate( - "_target-has-llvm-16 || target={{.+}}-apple-macosx{{13.[4-9](.0)?}}", + "_target-has-llvm-16 || target={{.+}}-apple-macosx{{13.[4-9](.[0-9]+)?}}", cfg.available_features, ), ), @@ -648,21 +648,21 @@ def check_gdb(cfg): Feature( name="_target-has-llvm-13", when=lambda cfg: BooleanExpression.evaluate( - "_target-has-llvm-14 || target={{.+}}-apple-macosx{{13.[0-3](.0)?}}", + 
"_target-has-llvm-14 || target={{.+}}-apple-macosx{{13.[0-3](.[0-9]+)?}}", cfg.available_features, ), ), Feature( name="_target-has-llvm-12", when=lambda cfg: BooleanExpression.evaluate( - "_target-has-llvm-13 || target={{.+}}-apple-macosx{{12.[3-9](.0)?}}", + "_target-has-llvm-13 || target={{.+}}-apple-macosx{{12.[3-9](.[0-9]+)?}}", cfg.available_features, ), ), Feature( name="_target-has-llvm-11", when=lambda cfg: BooleanExpression.evaluate( - "_target-has-llvm-12 || target={{.+}}-apple-macosx{{(11.[0-9]|12.[0-2])(.0)?}}", + "_target-has-llvm-12 || target={{.+}}-apple-macosx{{(11.[0-9]|12.[0-2])(.[0-9]+)?}}", cfg.available_features, ), ), @@ -676,7 +676,7 @@ def check_gdb(cfg): Feature( name="_target-has-llvm-9", when=lambda cfg: BooleanExpression.evaluate( - "_target-has-llvm-10 || target={{.+}}-apple-macosx{{10.15(.0)?}}", + "_target-has-llvm-10 || target={{.+}}-apple-macosx{{10.15(.[0-9]+)?}}", cfg.available_features, ), ), @@ -719,7 +719,7 @@ def check_gdb(cfg): # a libc++ flavor that enables availability markup. Similarly, a test could fail when # run against the system library of an older version of FreeBSD, even though FreeBSD # doesn't provide availability markup at the time of writing this. 
-for version in ("9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19"): +for version in ("9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20"): DEFAULT_FEATURES.append( Feature( name="using-built-library-before-llvm-{}".format(version), @@ -755,27 +755,27 @@ def check_gdb(cfg): cfg.available_features, ), ), - # Tests that require __libcpp_verbose_abort support in the built library + # Tests that require std::to_chars(floating-point) in the built library Feature( - name="availability-verbose_abort-missing", + name="availability-fp_to_chars-missing", when=lambda cfg: BooleanExpression.evaluate( - "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-13)", + "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-14)", cfg.available_features, ), ), - # Tests that require std::pmr support in the built library + # Tests that require __libcpp_verbose_abort support in the built library Feature( - name="availability-pmr-missing", + name="availability-verbose_abort-missing", when=lambda cfg: BooleanExpression.evaluate( - "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-13)", + "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-15)", cfg.available_features, ), ), - # Tests that require std::to_chars(floating-point) in the built library + # Tests that require std::pmr support in the built library Feature( - name="availability-fp_to_chars-missing", + name="availability-pmr-missing", when=lambda cfg: BooleanExpression.evaluate( - "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-14)", + "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-16)", cfg.available_features, ), ), diff --git a/libcxxabi/test/configs/apple-libc++abi-backdeployment.cfg.in b/libcxxabi/test/configs/apple-libc++abi-backdeployment.cfg.in deleted file mode 100644 index 9e725c523f29b..0000000000000 --- 
a/libcxxabi/test/configs/apple-libc++abi-backdeployment.cfg.in +++ /dev/null @@ -1,66 +0,0 @@ -# Testing configuration for back-deployment against older Apple system libc++abi. -# -# Under this configuration, we compile and link all the test suite against the latest libc++abi, -# however we run against the libc++abi on a different platform. This emulates the workflow of -# a developer building their application using recent tools but with the goal of deploying -# on existing devices running an older OS (and hence an older dylib). - -import os, site -site.addsitedir(os.path.join('@LIBCXXABI_LIBCXX_PATH@', 'utils')) -import libcxx.test.params, libcxx.test.config, libcxx.test.dsl - -lit_config.load_config(config, '@CMAKE_CURRENT_BINARY_DIR@/cmake-bridge.cfg') - -BACKDEPLOYMENT_PARAMETERS = [ - libcxx.test.dsl.Parameter(name='cxx_runtime_root', type=str, - actions=lambda root: [libcxx.test.dsl.AddSubstitution('%{cxx-runtime-root}', root)], - help=""" - The simulated root of the system (for libc++) when running tests. - - This should be a directory hierarchy under which the libc++ dylib can be found. - The dylib in that hierarchy is the one that will be used at runtime when running - the tests. - """), - libcxx.test.dsl.Parameter(name='abi_runtime_root', type=str, - actions=lambda root: [libcxx.test.dsl.AddSubstitution('%{abi-runtime-root}', root)], - help=""" - The simulated root of the system (for libc++abi) when running tests. - - This should be a directory hierarchy under which the libc++abi dylib can be found. - The dylib in that hierarchy is the one that will be used at runtime when running - the tests. - """), - libcxx.test.dsl.Parameter(name='unwind_runtime_root', type=str, - actions=lambda root: [libcxx.test.dsl.AddSubstitution('%{unwind-runtime-root}', root)], - help=""" - The simulated root of the system (for libunwind) when running tests. - - This should be a directory hierarchy under which the libunwind dylib can be found. 
- The dylib in that hierarchy is the one that will be used at runtime when running - the tests. - """), -] - -config.substitutions.append(('%{flags}', - '-isysroot {}'.format('@CMAKE_OSX_SYSROOT@') if '@CMAKE_OSX_SYSROOT@' else '' -)) -config.substitutions.append(('%{compile_flags}', - '-nostdinc++ -I %{include} -I %{cxx-include} -I %{cxx-target-include} %{maybe-include-libunwind} -D_LIBCPP_ENABLE_CXX17_REMOVED_UNEXPECTED_FUNCTIONS ' + - '-I %{libcxx}/test/support -I %{libcxx}/src' -)) -config.substitutions.append(('%{link_flags}', - '-nostdlib++ -L %{lib} -lc++ -lc++abi' -)) -config.substitutions.append(('%{exec}', - '%{executor} --execdir %T --env DYLD_LIBRARY_PATH="%{cxx-runtime-root}:%{abi-runtime-root}:%{unwind-runtime-root}" -- ' -)) - -config.stdlib = 'apple-libc++' -config.using_system_stdlib = True - -libcxx.test.config.configure( - libcxx.test.params.DEFAULT_PARAMETERS + BACKDEPLOYMENT_PARAMETERS, - libcxx.test.features.DEFAULT_FEATURES, - config, - lit_config -) diff --git a/libcxxabi/test/configs/apple-libc++abi-system.cfg.in b/libcxxabi/test/configs/apple-libc++abi-system.cfg.in new file mode 100644 index 0000000000000..1e80eee4f8d0c --- /dev/null +++ b/libcxxabi/test/configs/apple-libc++abi-system.cfg.in @@ -0,0 +1,34 @@ +# Testing configuration for back-deployment against the system-provided libc++abi. +# +# Under this configuration, we compile and link all the test suite against the just-built +# libc++abi, but we run against the system libc++abi. 
+ +import os, site +site.addsitedir(os.path.join('@LIBCXXABI_LIBCXX_PATH@', 'utils')) +import libcxx.test.params, libcxx.test.config, libcxx.test.dsl + +lit_config.load_config(config, '@CMAKE_CURRENT_BINARY_DIR@/cmake-bridge.cfg') + +config.substitutions.append(('%{flags}', + '-isysroot {}'.format('@CMAKE_OSX_SYSROOT@') if '@CMAKE_OSX_SYSROOT@' else '' +)) +config.substitutions.append(('%{compile_flags}', + '-nostdinc++ -I %{include} -I %{cxx-include} -I %{cxx-target-include} %{maybe-include-libunwind} -D_LIBCPP_ENABLE_CXX17_REMOVED_UNEXPECTED_FUNCTIONS ' + + '-I %{libcxx}/test/support -I %{libcxx}/src' +)) +config.substitutions.append(('%{link_flags}', + '-nostdlib++ -L %{lib} -lc++' +)) +config.substitutions.append(('%{exec}', + '%{executor} --execdir %T -- ' +)) + +config.stdlib = 'apple-libc++' +config.using_system_stdlib = True + +libcxx.test.config.configure( + libcxx.test.params.DEFAULT_PARAMETERS, + libcxx.test.features.DEFAULT_FEATURES, + config, + lit_config +) diff --git a/libcxxabi/test/test_demangle.pass.cpp b/libcxxabi/test/test_demangle.pass.cpp index eb32b4679aff0..ad131bb3a8a7b 100644 --- a/libcxxabi/test/test_demangle.pass.cpp +++ b/libcxxabi/test/test_demangle.pass.cpp @@ -9,9 +9,8 @@ // This test is too big for most embedded devices. // XFAIL: LIBCXX-PICOLIBC-FIXME -// https://llvm.org/PR51407 was not fixed in some previously-released -// demanglers, which causes them to run into the infinite loop. -// UNSUPPORTED: using-built-library-before-llvm-14 +// This test exercises support for char array initializer lists added in dd8b266ef. +// UNSUPPORTED: using-built-library-before-llvm-20 // Android's long double on x86[-64] is (64/128)-bits instead of Linux's usual // 80-bit format, and this demangling test is failing on it. 
diff --git a/libunwind/test/configs/apple-libunwind-backdeployment.cfg.in b/libunwind/test/configs/apple-libunwind-backdeployment.cfg.in deleted file mode 100644 index 013c43ae78001..0000000000000 --- a/libunwind/test/configs/apple-libunwind-backdeployment.cfg.in +++ /dev/null @@ -1,67 +0,0 @@ -# Testing configuration for back-deployment against older Apple system libunwind. -# -# Under this configuration, we compile and link all the test suite against the latest libunwind, -# however we run against the libunwind on a different platform. This emulates the workflow of -# a developer building their application using recent tools but with the goal of deploying -# on existing devices running an older OS (and hence an older dylib). - -import os, site -site.addsitedir(os.path.join('@LIBUNWIND_LIBCXX_PATH@', 'utils')) -import libcxx.test.params, libcxx.test.config, libcxx.test.dsl - -lit_config.load_config(config, '@CMAKE_CURRENT_BINARY_DIR@/cmake-bridge.cfg') - -BACKDEPLOYMENT_PARAMETERS = [ - libcxx.test.dsl.Parameter(name='cxx_runtime_root', type=str, - actions=lambda root: [libcxx.test.dsl.AddSubstitution('%{cxx-runtime-root}', root)], - help=""" - The simulated root of the system (for libc++) when running tests. - - This should be a directory hierarchy under which the libc++ dylib can be found. - The dylib in that hierarchy is the one that will be used at runtime when running - the tests. - """), - libcxx.test.dsl.Parameter(name='abi_runtime_root', type=str, - actions=lambda root: [libcxx.test.dsl.AddSubstitution('%{abi-runtime-root}', root)], - help=""" - The simulated root of the system (for libc++abi) when running tests. - - This should be a directory hierarchy under which the libc++abi dylib can be found. - The dylib in that hierarchy is the one that will be used at runtime when running - the tests. 
- """), - libcxx.test.dsl.Parameter(name='unwind_runtime_root', type=str, - actions=lambda root: [libcxx.test.dsl.AddSubstitution('%{unwind-runtime-root}', root)], - help=""" - The simulated root of the system (for libunwind) when running tests. - - This should be a directory hierarchy under which the libunwind dylib can be found. - The dylib in that hierarchy is the one that will be used at runtime when running - the tests. - """), -] - -config.substitutions.append(('%{flags}', - '-isysroot {}'.format('@CMAKE_OSX_SYSROOT@') if '@CMAKE_OSX_SYSROOT@' else '' -)) -config.substitutions.append(('%{compile_flags}', - '-nostdinc++ -I %{include}' -)) -config.substitutions.append(('%{link_flags}', - '-nostdlib++ -L %{lib} -lc++ -lc++abi -lunwind' -)) -config.substitutions.append(('%{exec}', - '%{executor} --execdir %T --env DYLD_LIBRARY_PATH="%{cxx-runtime-root}:%{abi-runtime-root}:%{unwind-runtime-root}" -- ' -)) - -config.stdlib = 'apple-libc++' -config.using_system_stdlib = True - -import os, site -import libcxx.test.params, libcxx.test.config -libcxx.test.config.configure( - libcxx.test.params.DEFAULT_PARAMETERS + BACKDEPLOYMENT_PARAMETERS, - libcxx.test.features.DEFAULT_FEATURES, - config, - lit_config -) diff --git a/libunwind/test/configs/apple-libunwind-system.cfg.in b/libunwind/test/configs/apple-libunwind-system.cfg.in new file mode 100644 index 0000000000000..e5a7c983562a6 --- /dev/null +++ b/libunwind/test/configs/apple-libunwind-system.cfg.in @@ -0,0 +1,41 @@ +# Testing configuration for back-deployment against the system-provided libunwind. +# +# Under this configuration, we compile and link all the test suite against the just-built +# libunwind, but we run against the system libunwind. 
+ +import os, site +site.addsitedir(os.path.join('@LIBUNWIND_LIBCXX_PATH@', 'utils')) +import libcxx.test.params, libcxx.test.config, libcxx.test.dsl + +lit_config.load_config(config, '@CMAKE_CURRENT_BINARY_DIR@/cmake-bridge.cfg') + +config.substitutions.append(('%{flags}', + '-isysroot {}'.format('@CMAKE_OSX_SYSROOT@') if '@CMAKE_OSX_SYSROOT@' else '' +)) +config.substitutions.append(('%{compile_flags}', + '-nostdinc++ -I %{include}' +)) +config.substitutions.append(('%{link_flags}', + '-nostdlib++ -L %{lib} -lc++ -lunwind' +)) +config.substitutions.append(('%{exec}', + '%{executor} --execdir %T -- ' +)) + +config.stdlib = 'apple-libc++' +config.using_system_stdlib = True + +# TODO: This is a giant hack, but we need to change the install_name of libunwind.dylib because the +# upstream configuration can't currently produce a libunwind.dylib that is compatible with the +# Apple system one. +import subprocess +subprocess.check_call(['install_name_tool', '-id', '/usr/lib/system/libunwind.dylib', '@LIBUNWIND_TESTING_INSTALL_PREFIX@/lib/libunwind.dylib']) + +import os, site +import libcxx.test.params, libcxx.test.config +libcxx.test.config.configure( + libcxx.test.params.DEFAULT_PARAMETERS, + libcxx.test.features.DEFAULT_FEATURES, + config, + lit_config +) From 725eb6bb12e7471149fb7362093deb6a710fe258 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 30 Sep 2024 22:11:21 +0100 Subject: [PATCH 026/151] [VPlan] Move createVPIRBasicBlock helper to VPIRBasicBlock (NFC). 
Move the helper to VPIRBasicBlock to allow easier re-use outside VPlan.cpp --- llvm/lib/Transforms/Vectorize/VPlan.cpp | 13 +++++++------ llvm/lib/Transforms/Vectorize/VPlan.h | 4 ++++ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 6ddbfcf0ecfe5..4247d20cb0e53 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -863,10 +863,10 @@ VPlan::~VPlan() { delete BackedgeTakenCount; } -static VPIRBasicBlock *createVPIRBasicBlockFor(BasicBlock *BB) { - auto *VPIRBB = new VPIRBasicBlock(BB); +VPIRBasicBlock *VPIRBasicBlock::fromBasicBlock(BasicBlock *IRBB) { + auto *VPIRBB = new VPIRBasicBlock(IRBB); for (Instruction &I : - make_range(BB->begin(), BB->getTerminator()->getIterator())) + make_range(IRBB->begin(), IRBB->getTerminator()->getIterator())) VPIRBB->appendRecipe(new VPIRInstruction(I)); return VPIRBB; } @@ -875,7 +875,8 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, PredicatedScalarEvolution &PSE, bool RequiresScalarEpilogueCheck, bool TailFolded, Loop *TheLoop) { - VPIRBasicBlock *Entry = createVPIRBasicBlockFor(TheLoop->getLoopPreheader()); + VPIRBasicBlock *Entry = + VPIRBasicBlock::fromBasicBlock(TheLoop->getLoopPreheader()); VPBasicBlock *VecPreheader = new VPBasicBlock("vector.ph"); auto Plan = std::make_unique(Entry, VecPreheader); @@ -915,7 +916,7 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, // we unconditionally branch to the scalar preheader. Do nothing. // 3) Otherwise, construct a runtime check. BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock(); - auto *VPExitBlock = createVPIRBasicBlockFor(IRExitBlock); + auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock); // The connection order corresponds to the operands of the conditional branch. 
VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB); VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); @@ -991,7 +992,7 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, /// have a single predecessor, which is rewired to the new VPIRBasicBlock. All /// successors of VPBB, if any, are rewired to the new VPIRBasicBlock. static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) { - VPIRBasicBlock *IRVPBB = createVPIRBasicBlockFor(IRBB); + VPIRBasicBlock *IRVPBB = VPIRBasicBlock::fromBasicBlock(IRBB); for (auto &R : make_early_inc_range(*VPBB)) { assert(!R.isPhi() && "Tried to move phi recipe to end of block"); R.moveBefore(*IRVPBB, IRVPBB->end()); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index c4567362eaffc..8392aec8ad396 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -3318,6 +3318,10 @@ class VPIRBasicBlock : public VPBasicBlock { return V->getVPBlockID() == VPBlockBase::VPIRBasicBlockSC; } + /// Create a VPIRBasicBlock from \p IRBB containing VPIRInstructions for all + /// instructions in \p IRBB, except its terminator which is managed in VPlan. + static VPIRBasicBlock *fromBasicBlock(BasicBlock *IRBB); + /// The method which generates the output IR instructions that correspond to /// this VPBasicBlock, thereby "executing" the VPlan. void execute(VPTransformState *State) override; From 78ccffc05336201c90e2c0bb2ae929ea3a6eec2b Mon Sep 17 00:00:00 2001 From: David Truby Date: Mon, 30 Sep 2024 22:40:16 +0100 Subject: [PATCH 027/151] [flang] Add MALLOC and FREE intrinsics for Cray pointers (#110018) MALLOC and FREE are extensions provided by gfortran, Intel Fortran and classic flang to allocate memory for Cray pointers. These are used in some legacy codes such as libexodus. All the above compilers accept using MALLOC and FREE with integers as well, despite that this will often signify a bug in user code. 
We should accept the same as the other compilers for compatibility. --- flang/docs/Intrinsics.md | 4 +- .../flang/Optimizer/Builder/IntrinsicCall.h | 2 + .../Optimizer/Builder/Runtime/Intrinsics.h | 4 + flang/include/flang/Runtime/extensions.h | 4 + flang/lib/Evaluate/intrinsics.cpp | 2 + flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 15 ++++ .../Optimizer/Builder/Runtime/Intrinsics.cpp | 20 +++++ flang/lib/Semantics/check-call.cpp | 14 ++++ flang/runtime/extensions.cpp | 8 ++ flang/test/Lower/Intrinsics/free.f90 | 66 ++++++++++++++++ flang/test/Lower/Intrinsics/malloc.f90 | 75 +++++++++++++++++++ flang/test/Semantics/free.f90 | 33 ++++++++ 12 files changed, 245 insertions(+), 2 deletions(-) create mode 100644 flang/test/Lower/Intrinsics/free.f90 create mode 100644 flang/test/Lower/Intrinsics/malloc.f90 create mode 100644 flang/test/Semantics/free.f90 diff --git a/flang/docs/Intrinsics.md b/flang/docs/Intrinsics.md index 87716731ead85..d6f48a7fd87d7 100644 --- a/flang/docs/Intrinsics.md +++ b/flang/docs/Intrinsics.md @@ -700,7 +700,7 @@ IBCHNG, ISHA, ISHC, ISHL, IXOR IARG, IARGC, NARGS, NUMARG BADDRESS, IADDR CACHESIZE, EOF, FP_CLASS, INT_PTR_KIND, ISNAN, LOC -MALLOC +MALLOC, FREE ``` ### Library subroutine @@ -765,7 +765,7 @@ This phase currently supports all the intrinsic procedures listed above but the | Coarray intrinsic functions | COSHAPE | | Object characteristic inquiry functions | ALLOCATED, ASSOCIATED, EXTENDS_TYPE_OF, IS_CONTIGUOUS, PRESENT, RANK, SAME_TYPE, STORAGE_SIZE | | Type inquiry intrinsic functions | BIT_SIZE, DIGITS, EPSILON, HUGE, KIND, MAXEXPONENT, MINEXPONENT, NEW_LINE, PRECISION, RADIX, RANGE, TINY| -| Non-standard intrinsic functions | AND, OR, XOR, SHIFT, ZEXT, IZEXT, COSD, SIND, TAND, ACOSD, ASIND, ATAND, ATAN2D, COMPL, EQV, NEQV, INT8, JINT, JNINT, KNINT, QCMPLX, DREAL, DFLOAT, QEXT, QFLOAT, QREAL, DNUM, NUM, JNUM, KNUM, QNUM, RNUM, RAN, RANF, ILEN, SIZEOF, MCLOCK, SECNDS, COTAN, IBCHNG, ISHA, ISHC, ISHL, IXOR, IARG, IARGC, NARGS, 
GETPID, NUMARG, BADDRESS, IADDR, CACHESIZE, EOF, FP_CLASS, INT_PTR_KIND, ISNAN, MALLOC | +| Non-standard intrinsic functions | AND, OR, XOR, SHIFT, ZEXT, IZEXT, COSD, SIND, TAND, ACOSD, ASIND, ATAND, ATAN2D, COMPL, EQV, NEQV, INT8, JINT, JNINT, KNINT, QCMPLX, DREAL, DFLOAT, QEXT, QFLOAT, QREAL, DNUM, NUM, JNUM, KNUM, QNUM, RNUM, RAN, RANF, ILEN, SIZEOF, MCLOCK, SECNDS, COTAN, IBCHNG, ISHA, ISHC, ISHL, IXOR, IARG, IARGC, NARGS, GETPID, NUMARG, BADDRESS, IADDR, CACHESIZE, EOF, FP_CLASS, INT_PTR_KIND, ISNAN, MALLOC, FREE | | Intrinsic subroutines |MVBITS (elemental), CPU_TIME, DATE_AND_TIME, EVENT_QUERY, EXECUTE_COMMAND_LINE, GET_COMMAND, GET_COMMAND_ARGUMENT, GET_ENVIRONMENT_VARIABLE, MOVE_ALLOC, RANDOM_INIT, RANDOM_NUMBER, RANDOM_SEED, SIGNAL, SLEEP, SYSTEM, SYSTEM_CLOCK | | Atomic intrinsic subroutines | ATOMIC_ADD | | Collective intrinsic subroutines | CO_REDUCE | diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index 78bb82b17d405..ca4030816b1a0 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -249,6 +249,7 @@ struct IntrinsicLibrary { mlir::Value genFloor(mlir::Type, llvm::ArrayRef); mlir::Value genFraction(mlir::Type resultType, mlir::ArrayRef args); + void genFree(mlir::ArrayRef args); fir::ExtendedValue genGetCwd(std::optional resultType, llvm::ArrayRef args); void genGetCommand(mlir::ArrayRef args); @@ -315,6 +316,7 @@ struct IntrinsicLibrary { fir::ExtendedValue genLen(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genLenTrim(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genLoc(mlir::Type, llvm::ArrayRef); + mlir::Value genMalloc(mlir::Type, llvm::ArrayRef); template mlir::Value genMask(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genMatmul(mlir::Type, llvm::ArrayRef); diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h b/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h 
index 240de5a899d37..f62071a49e3bf 100644 --- a/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h +++ b/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h @@ -47,6 +47,10 @@ void genDateAndTime(fir::FirOpBuilder &, mlir::Location, void genEtime(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value values, mlir::Value time); +void genFree(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value ptr); +mlir::Value genMalloc(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value size); + void genRandomInit(fir::FirOpBuilder &, mlir::Location, mlir::Value repeatable, mlir::Value imageDistinct); void genRandomNumber(fir::FirOpBuilder &, mlir::Location, mlir::Value harvest); diff --git a/flang/include/flang/Runtime/extensions.h b/flang/include/flang/Runtime/extensions.h index fef651f3b2eed..8b7607be7e999 100644 --- a/flang/include/flang/Runtime/extensions.h +++ b/flang/include/flang/Runtime/extensions.h @@ -28,6 +28,8 @@ void FORTRAN_PROCEDURE_NAME(flush)(const int &unit); // GNU extension subroutine FDATE void FORTRAN_PROCEDURE_NAME(fdate)(char *string, std::int64_t length); +void RTNAME(Free)(std::intptr_t ptr); + // GNU Fortran 77 compatibility function IARGC. std::int32_t FORTRAN_PROCEDURE_NAME(iargc)(); @@ -38,6 +40,8 @@ void FORTRAN_PROCEDURE_NAME(getarg)( // GNU extension subroutine GETLOG(C). 
void FORTRAN_PROCEDURE_NAME(getlog)(char *name, std::int64_t length); +std::intptr_t RTNAME(Malloc)(std::size_t size); + // GNU extension function STATUS = SIGNAL(number, handler) std::int64_t RTNAME(Signal)(std::int64_t number, void (*handler)(int)); diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp index 17a09c080e72c..a89e9732228cb 100644 --- a/flang/lib/Evaluate/intrinsics.cpp +++ b/flang/lib/Evaluate/intrinsics.cpp @@ -620,6 +620,7 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"log10", {{"x", SameReal}}, SameReal}, {"logical", {{"l", AnyLogical}, DefaultingKIND}, KINDLogical}, {"log_gamma", {{"x", SameReal}}, SameReal}, + {"malloc", {{"size", AnyInt}}, SubscriptInt}, {"matmul", {{"matrix_a", AnyLogical, Rank::vector}, {"matrix_b", AnyLogical, Rank::matrix}}, @@ -1409,6 +1410,7 @@ static const IntrinsicInterface intrinsicSubroutine[]{ {}, Rank::elemental, IntrinsicClass::impureSubroutine}, {"exit", {{"status", DefaultInt, Rank::scalar, Optionality::optional}}, {}, Rank::elemental, IntrinsicClass::impureSubroutine}, + {"free", {{"ptr", Addressable}}, {}}, {"get_command", {{"command", DefaultChar, Rank::scalar, Optionality::optional, common::Intent::Out}, diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 4e6d92213c124..86f7d14c6592b 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -265,6 +265,7 @@ static constexpr IntrinsicHandler handlers[]{ /*isElemental=*/false}, {"floor", &I::genFloor}, {"fraction", &I::genFraction}, + {"free", &I::genFree}, {"get_command", &I::genGetCommand, {{{"command", asBox, handleDynamicOptional}, @@ -436,6 +437,7 @@ static constexpr IntrinsicHandler handlers[]{ {"lle", &I::genCharacterCompare}, {"llt", &I::genCharacterCompare}, {"loc", &I::genLoc, {{{"x", asBox}}}, /*isElemental=*/false}, + {"malloc", &I::genMalloc}, {"maskl", &I::genMask}, {"maskr", 
&I::genMask}, {"matmul", @@ -3581,6 +3583,12 @@ mlir::Value IntrinsicLibrary::genFraction(mlir::Type resultType, fir::runtime::genFraction(builder, loc, fir::getBase(args[0]))); } +void IntrinsicLibrary::genFree(llvm::ArrayRef args) { + assert(args.size() == 1); + + fir::runtime::genFree(builder, loc, fir::getBase(args[0])); +} + // GETCWD fir::ExtendedValue IntrinsicLibrary::genGetCwd(std::optional resultType, @@ -5307,6 +5315,13 @@ IntrinsicLibrary::genLoc(mlir::Type resultType, .getResults()[0]; } +mlir::Value IntrinsicLibrary::genMalloc(mlir::Type resultType, + llvm::ArrayRef args) { + assert(args.size() == 1); + return builder.createConvert(loc, resultType, + fir::runtime::genMalloc(builder, loc, args[0])); +} + // MASKL, MASKR template mlir::Value IntrinsicLibrary::genMask(mlir::Type resultType, diff --git a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp index aff3cadc3c300..cf2483d36c027 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp @@ -120,6 +120,26 @@ void fir::runtime::genEtime(fir::FirOpBuilder &builder, mlir::Location loc, builder.create(loc, runtimeFunc, args); } +void fir::runtime::genFree(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value ptr) { + auto runtimeFunc = fir::runtime::getRuntimeFunc(loc, builder); + mlir::Type intPtrTy = builder.getIntPtrType(); + + builder.create(loc, runtimeFunc, + builder.createConvert(loc, intPtrTy, ptr)); +} + +mlir::Value fir::runtime::genMalloc(fir::FirOpBuilder &builder, + mlir::Location loc, mlir::Value size) { + auto runtimeFunc = + fir::runtime::getRuntimeFunc(loc, builder); + auto argTy = runtimeFunc.getArgumentTypes()[0]; + return builder + .create(loc, runtimeFunc, + builder.createConvert(loc, argTy, size)) + .getResult(0); +} + void fir::runtime::genRandomInit(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value repeatable, mlir::Value imageDistinct) { diff 
--git a/flang/lib/Semantics/check-call.cpp b/flang/lib/Semantics/check-call.cpp index 71d1c083c3127..31079174239c2 100644 --- a/flang/lib/Semantics/check-call.cpp +++ b/flang/lib/Semantics/check-call.cpp @@ -1600,6 +1600,18 @@ static void CheckMaxMin(const characteristics::Procedure &proc, } } +static void CheckFree(evaluate::ActualArguments &arguments, + parser::ContextualMessages &messages) { + if (arguments.size() != 1) { + messages.Say("FREE expects a single argument"_err_en_US); + } + auto arg = arguments[0]; + if (const Symbol * symbol{evaluate::UnwrapWholeSymbolDataRef(arg)}; + !symbol || !symbol->test(Symbol::Flag::CrayPointer)) { + messages.Say("FREE should only be used with Cray pointers"_warn_en_US); + } +} + // MOVE_ALLOC (F'2023 16.9.147) static void CheckMove_Alloc(evaluate::ActualArguments &arguments, parser::ContextualMessages &messages) { @@ -1885,6 +1897,8 @@ static void CheckSpecificIntrinsic(const characteristics::Procedure &proc, CheckReduce(arguments, context.foldingContext()); } else if (intrinsic.name == "transfer") { CheckTransfer(arguments, context, scope); + } else if (intrinsic.name == "free") { + CheckFree(arguments, context.foldingContext().messages()); } } diff --git a/flang/runtime/extensions.cpp b/flang/runtime/extensions.cpp index be3833db88b07..4412a9cbeb6d2 100644 --- a/flang/runtime/extensions.cpp +++ b/flang/runtime/extensions.cpp @@ -96,6 +96,10 @@ void FORTRAN_PROCEDURE_NAME(fdate)(char *arg, std::int64_t length) { CopyAndPad(arg, str, length, 24); } +std::intptr_t RTNAME(Malloc)(std::size_t size) { + return reinterpret_cast(std::malloc(size)); +} + // RESULT = IARGC() std::int32_t FORTRAN_PROCEDURE_NAME(iargc)() { return RTNAME(ArgumentCount)(); } @@ -124,6 +128,10 @@ void FORTRAN_PROCEDURE_NAME(getlog)(char *arg, std::int64_t length) { #endif } +void RTNAME(Free)(std::intptr_t ptr) { + std::free(reinterpret_cast(ptr)); +} + std::int64_t RTNAME(Signal)(std::int64_t number, void (*handler)(int)) { // using auto for 
portability: // on Windows, this is a void * diff --git a/flang/test/Lower/Intrinsics/free.f90 b/flang/test/Lower/Intrinsics/free.f90 new file mode 100644 index 0000000000000..bb8d38e737aa7 --- /dev/null +++ b/flang/test/Lower/Intrinsics/free.f90 @@ -0,0 +1,66 @@ +! RUN: bbc -emit-hlfir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-hlfir %s -o - | FileCheck %s + +! CHECK-LABEL: func.func @_QPfree_ptr() { +subroutine free_ptr() + integer :: x + pointer (ptr_x, x) + ! CHECK: %[[X:.*]] = fir.alloca !fir.box> + ! CHECK: %[[X_PTR:.*]] = fir.alloca i64 {bindc_name = "ptr_x", uniq_name = "_QFfree_ptrEptr_x"} + ! CHECK: %[[X_PTR_DECL:.*]]:2 = hlfir.declare %[[X_PTR]] {uniq_name = "_QFfree_ptrEptr_x"} : (!fir.ref) -> (!fir.ref, !fir.ref) + ! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFfree_ptrEx"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) + ! CHECK: %[[X_LD:.*]] = fir.load %[[X_PTR_DECL]]#0 : !fir.ref + ! CHECK: %[[VOID:.*]] = fir.call @_FortranAFree(%[[X_LD]]) fastmath : (i64) -> none + ! CHECK: return + call free(ptr_x) +end subroutine + +! gfortran allows free to be used on integers, so we accept it with a warning. + +! CHECK-LABEL: func.func @_QPfree_i8() { +subroutine free_i8 + integer (kind=1) :: x + ! CHECK: %[[X:.*]] = fir.alloca i8 {bindc_name = "x", uniq_name = "_QFfree_i8Ex"} + ! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFfree_i8Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) + ! CHECK: %[[X_LD:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref + ! CHECK: %[[X_I64:.*]] = fir.convert %[[X_LD]] : (i8) -> i64 + ! CHECK: %[[VOID:.*]] = fir.call @_FortranAFree(%[[X_I64]]) fastmath : (i64) -> none + ! CHECK: return + call free(x) +end subroutine + + +! CHECK-LABEL: func.func @_QPfree_i16() { +subroutine free_i16 + integer (kind=2) :: x + ! CHECK: %[[X:.*]] = fir.alloca i16 {bindc_name = "x", uniq_name = "_QFfree_i16Ex"} + ! 
CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFfree_i16Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) + ! CHECK: %[[X_LD:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref + ! CHECK: %[[X_I64:.*]] = fir.convert %[[X_LD]] : (i16) -> i64 + ! CHECK: %[[VOID:.*]] = fir.call @_FortranAFree(%[[X_I64]]) fastmath : (i64) -> none + ! CHECK: return + call free(x) +end subroutine + +! CHECK-LABEL: func.func @_QPfree_i32() { +subroutine free_i32 + integer (kind=4) :: x + ! CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFfree_i32Ex"} + ! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFfree_i32Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) + ! CHECK: %[[X_LD:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref + ! CHECK: %[[X_I64:.*]] = fir.convert %[[X_LD]] : (i32) -> i64 + ! CHECK: %[[VOID:.*]] = fir.call @_FortranAFree(%[[X_I64]]) fastmath : (i64) -> none + ! CHECK: return + call free(x) +end subroutine + +! CHECK-LABEL: func.func @_QPfree_i64() { +subroutine free_i64 + integer (kind=8) :: x + ! CHECK: %[[X:.*]] = fir.alloca i64 {bindc_name = "x", uniq_name = "_QFfree_i64Ex"} + ! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFfree_i64Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) + ! CHECK: %[[X_LD:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref + ! CHECK: %[[VOID:.*]] = fir.call @_FortranAFree(%[[X_LD]]) fastmath : (i64) -> none + ! CHECK: return + call free(x) +end subroutine diff --git a/flang/test/Lower/Intrinsics/malloc.f90 b/flang/test/Lower/Intrinsics/malloc.f90 new file mode 100644 index 0000000000000..4a9b65bf7ae18 --- /dev/null +++ b/flang/test/Lower/Intrinsics/malloc.f90 @@ -0,0 +1,75 @@ +! RUN: bbc -emit-hlfir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-hlfir %s -o - | FileCheck %s + +! CHECK-LABEL: func.func @_QPmalloc_ptr() { +subroutine malloc_ptr() + integer :: x + pointer (ptr_x, x) + ! CHECK: %[[X:.*]] = fir.alloca !fir.box> + ! CHECK: %[[X_PTR:.*]] = fir.alloca i64 {bindc_name = "ptr_x", uniq_name = "_QFmalloc_ptrEptr_x"} + ! 
CHECK: %[[X_PTR_DECL:.*]]:2 = hlfir.declare %[[X_PTR]] {uniq_name = "_QFmalloc_ptrEptr_x"} : (!fir.ref) -> (!fir.ref, !fir.ref) + ! CHECK: %[[CST:.*]] = arith.constant 4 : i32 + ! CHECK: %[[CST_I64:.*]] = fir.convert %[[CST]] : (i32) -> i64 + ! CHECK: %[[ALLOC:.*]] = fir.call @_FortranAMalloc(%[[CST_I64]]) fastmath : (i64) -> i64 + ! CHECK: hlfir.assign %[[ALLOC]] to %[[X_PTR_DECL]]#0 : i64, !fir.ref + ! CHECK: return + ptr_x = malloc(4) +end subroutine + +! gfortran allows malloc to be assigned to integers, so we accept it. + +! CHECK-LABEL: func.func @_QPmalloc_i8() { +subroutine malloc_i8() + integer(kind=1) :: x +! CHECK: %[[X:.*]] = fir.alloca i8 {bindc_name = "x", uniq_name = "_QFmalloc_i8Ex"} +! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFmalloc_i8Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[CST:.*]] = arith.constant 1 : i32 +! CHECK: %[[CST_I64:.*]] = fir.convert %[[CST]] : (i32) -> i64 +! CHECK: %[[ALLOC:.*]] = fir.call @_FortranAMalloc(%[[CST_I64]]) fastmath : (i64) -> i64 +! CHECK: %[[ALLOC_I8:.*]] = fir.convert %[[ALLOC]] : (i64) -> i8 +! CHECK: hlfir.assign %[[ALLOC_I8]] to %[[X_DECL]]#0 : i8, !fir.ref +! CHECK: return + x = malloc(1) +end subroutine + +! CHECK-LABEL: func.func @_QPmalloc_i16() { +subroutine malloc_i16() + integer(kind=2) :: x +! CHECK: %[[X:.*]] = fir.alloca i16 {bindc_name = "x", uniq_name = "_QFmalloc_i16Ex"} +! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFmalloc_i16Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[CST:.*]] = arith.constant 1 : i32 +! CHECK: %[[CST_I64:.*]] = fir.convert %[[CST]] : (i32) -> i64 +! CHECK: %[[ALLOC:.*]] = fir.call @_FortranAMalloc(%[[CST_I64]]) fastmath : (i64) -> i64 +! CHECK: %[[ALLOC_I16:.*]] = fir.convert %[[ALLOC]] : (i64) -> i16 +! CHECK: hlfir.assign %[[ALLOC_I16]] to %[[X_DECL]]#0 : i16, !fir.ref +! CHECK: return + x = malloc(1) +end subroutine + + +! 
CHECK-LABEL: func.func @_QPmalloc_i32() { +subroutine malloc_i32() + integer(kind=4) :: x +! CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmalloc_i32Ex"} +! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFmalloc_i32Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[CST:.*]] = arith.constant 1 : i32 +! CHECK: %[[CST_I64:.*]] = fir.convert %[[CST]] : (i32) -> i64 +! CHECK: %[[ALLOC:.*]] = fir.call @_FortranAMalloc(%[[CST_I64]]) fastmath : (i64) -> i64 +! CHECK: %[[ALLOC_I32:.*]] = fir.convert %[[ALLOC]] : (i64) -> i32 +! CHECK: hlfir.assign %[[ALLOC_I32]] to %[[X_DECL]]#0 : i32, !fir.ref +! CHECK: return + x = malloc(1) +end subroutine + +! CHECK-LABEL: func.func @_QPmalloc_i64() { +subroutine malloc_i64() + integer(kind=8) :: x +! CHECK: %[[X:.*]] = fir.alloca i64 {bindc_name = "x", uniq_name = "_QFmalloc_i64Ex"} +! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFmalloc_i64Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[CST:.*]] = arith.constant 1 : i32 +! CHECK: %[[CST_I64:.*]] = fir.convert %[[CST]] : (i32) -> i64 +! CHECK: %[[ALLOC:.*]] = fir.call @_FortranAMalloc(%[[CST_I64]]) fastmath : (i64) -> i64 +! CHECK: hlfir.assign %[[ALLOC]] to %[[X_DECL]]#0 : i64, !fir.ref +! CHECK: return + x = malloc(1) +end subroutine diff --git a/flang/test/Semantics/free.f90 b/flang/test/Semantics/free.f90 new file mode 100644 index 0000000000000..6332f03b19cd8 --- /dev/null +++ b/flang/test/Semantics/free.f90 @@ -0,0 +1,33 @@ +! RUN: %python %S/test_errors.py %s %flang_fc1 -Werror + +! Accept free of cray pointer without warning +subroutine free_cptr() + integer :: x + pointer(ptr_x, x) + call free(ptr_x) +end subroutine + +subroutine free_i8() + integer(kind=1) :: x + ! WARNING: FREE should only be used with Cray pointers + call free(x) +end subroutine + + +subroutine free_i16() + integer(kind=2) :: x + ! 
WARNING: FREE should only be used with Cray pointers + call free(x) +end subroutine + +subroutine free_i32() + integer(kind=4) :: x + ! WARNING: FREE should only be used with Cray pointers + call free(x) +end subroutine + +subroutine free_i64() + integer(kind=8) :: x + ! WARNING: FREE should only be used with Cray pointers + call free(x) +end subroutine From 4980f2177e5c1b68afc8249c52523cc0a38ecf1c Mon Sep 17 00:00:00 2001 From: BARRET <41060790+Adnios@users.noreply.github.com> Date: Tue, 1 Oct 2024 05:57:13 +0800 Subject: [PATCH 028/151] CMake: Remove unnecessary dependencies on LLVM/MLIR (#110362) There are some spurious libraries which can be removed. I'm trying to bundle MLIR/LLVM library dependencies for our own libraries. We're utilizing cmake function to recursively collect MLIR/LLVM related dependencies. However, we identified certain library dependencies as redundant and safe for removal. --- llvm/lib/MC/CMakeLists.txt | 1 - mlir/lib/Conversion/ConvertToLLVM/CMakeLists.txt | 1 - mlir/lib/Conversion/VectorToLLVM/CMakeLists.txt | 2 -- mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt | 1 - mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt | 2 -- mlir/lib/Dialect/Func/Transforms/CMakeLists.txt | 2 -- mlir/lib/Dialect/GPU/CMakeLists.txt | 2 -- mlir/lib/Dialect/Linalg/IR/CMakeLists.txt | 2 -- mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt | 8 -------- mlir/lib/Dialect/MemRef/Transforms/CMakeLists.txt | 3 --- mlir/lib/Dialect/Mesh/Transforms/CMakeLists.txt | 2 -- mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt | 4 ---- mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt | 4 ---- mlir/lib/Dialect/Tosa/CMakeLists.txt | 3 --- mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt | 2 -- mlir/lib/Target/LLVM/CMakeLists.txt | 1 - mlir/lib/Target/LLVMIR/CMakeLists.txt | 2 -- mlir/lib/Target/SPIRV/CMakeLists.txt | 4 ---- mlir/lib/Target/SPIRV/Serialization/CMakeLists.txt | 3 --- mlir/lib/Transforms/CMakeLists.txt | 1 - mlir/unittests/Target/LLVM/CMakeLists.txt | 1 + 21 files 
changed, 1 insertion(+), 50 deletions(-) diff --git a/llvm/lib/MC/CMakeLists.txt b/llvm/lib/MC/CMakeLists.txt index a089d2bff94f4..e1d19196c8766 100644 --- a/llvm/lib/MC/CMakeLists.txt +++ b/llvm/lib/MC/CMakeLists.txt @@ -79,7 +79,6 @@ add_llvm_component_library(LLVMMC Support TargetParser BinaryFormat - DebugInfoCodeView DEPENDS intrinsics_gen diff --git a/mlir/lib/Conversion/ConvertToLLVM/CMakeLists.txt b/mlir/lib/Conversion/ConvertToLLVM/CMakeLists.txt index df7e3f995303c..de3d850d520c0 100644 --- a/mlir/lib/Conversion/ConvertToLLVM/CMakeLists.txt +++ b/mlir/lib/Conversion/ConvertToLLVM/CMakeLists.txt @@ -20,7 +20,6 @@ add_mlir_conversion_library(MLIRConvertToLLVMPass MLIRConversionPassIncGen LINK_LIBS PUBLIC - MLIRConvertToLLVMInterface MLIRIR MLIRLLVMCommonConversion MLIRLLVMDialect diff --git a/mlir/lib/Conversion/VectorToLLVM/CMakeLists.txt b/mlir/lib/Conversion/VectorToLLVM/CMakeLists.txt index 35576732c82cf..27b5bf7eaa89f 100644 --- a/mlir/lib/Conversion/VectorToLLVM/CMakeLists.txt +++ b/mlir/lib/Conversion/VectorToLLVM/CMakeLists.txt @@ -34,8 +34,6 @@ add_mlir_conversion_library(MLIRVectorToLLVMPass LINK_LIBS PUBLIC MLIRVectorToLLVM - MLIRArmNeonDialect - MLIRArmSMEDialect MLIRArmSVEDialect MLIRArmSVETransforms MLIRAMXDialect diff --git a/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt index 772f15335d907..607d4557e2f6a 100644 --- a/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt @@ -32,7 +32,6 @@ add_mlir_dialect_library(MLIRAffineTransforms MLIRIR MLIRMemRefDialect MLIRPass - MLIRSCFUtils MLIRSideEffectInterfaces MLIRTensorDialect MLIRTransformUtils diff --git a/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt index 6b8bde8dc2aaf..93a004d31916f 100644 --- a/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt @@ -20,8 +20,6 @@ 
add_mlir_dialect_library(MLIRArithTransforms LINK_LIBS PUBLIC MLIRAnalysis MLIRArithDialect - MLIRBufferizationDialect - MLIRBufferizationTransforms MLIRFuncDialect MLIRFuncTransforms MLIRInferIntRangeInterface diff --git a/mlir/lib/Dialect/Func/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Func/Transforms/CMakeLists.txt index 172019907c3a8..6ba7aaaae903f 100644 --- a/mlir/lib/Dialect/Func/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Func/Transforms/CMakeLists.txt @@ -12,8 +12,6 @@ add_mlir_dialect_library(MLIRFuncTransforms MLIRFuncTransformsIncGen LINK_LIBS PUBLIC - MLIRBufferizationDialect - MLIRBufferizationTransforms MLIRFuncDialect MLIRIR MLIRMemRefDialect diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt index a59645480aba2..a8a961282993a 100644 --- a/mlir/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/CMakeLists.txt @@ -67,9 +67,7 @@ add_mlir_dialect_library(MLIRGPUTransforms MLIRPass MLIRSCFDialect MLIRSideEffectInterfaces - MLIRSPIRVTarget MLIRSupport - MLIRROCDLTarget MLIRTransformUtils MLIRVectorDialect ) diff --git a/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt b/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt index c187563b8f0c4..f1fcb22fb8d54 100644 --- a/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt @@ -25,14 +25,12 @@ add_mlir_dialect_library(MLIRLinalgDialect MLIRInferTypeOpInterface MLIRIR MLIRParser - MLIRShardingInterface MLIRSideEffectInterfaces MLIRSparseTensorDialect MLIRSCFDialect MLIRMathDialect MLIRMemRefDialect MLIRTensorDialect - MLIRTilingInterface MLIRValueBoundsOpInterface MLIRViewLikeInterface ) diff --git a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt index 47af392def94a..06bac0d470d42 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt @@ -48,14 +48,11 @@ add_mlir_dialect_library(MLIRLinalgTransforms LINK_LIBS PUBLIC 
MLIRAffineDialect - MLIRAffineTransforms MLIRAffineUtils MLIRAnalysis MLIRArithDialect MLIRArithTransforms MLIRArithUtils - MLIRBufferizationDialect - MLIRBufferizationTransforms MLIRComplexDialect MLIRDestinationStyleOpInterface MLIRDialectUtils @@ -66,20 +63,15 @@ add_mlir_dialect_library(MLIRLinalgTransforms MLIRIR MLIRMemRefDialect MLIRMemRefTransforms - MLIRMeshDialect MLIRMeshTransforms MLIRLinalgDialect MLIRLinalgUtils MLIRSCFDialect MLIRSCFTransforms - MLIRSCFUtils MLIRPass - MLIRShardingInterface MLIRSubsetOpInterface MLIRSparseTensorDialect MLIRTensorDialect - MLIRTensorTilingInterfaceImpl - MLIRTensorTransforms MLIRTransforms MLIRTransformUtils MLIRValueBoundsOpInterface diff --git a/mlir/lib/Dialect/MemRef/Transforms/CMakeLists.txt b/mlir/lib/Dialect/MemRef/Transforms/CMakeLists.txt index 4c7f42745ce8b..ecab97bc2b8e7 100644 --- a/mlir/lib/Dialect/MemRef/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/MemRef/Transforms/CMakeLists.txt @@ -22,13 +22,10 @@ add_mlir_dialect_library(MLIRMemRefTransforms MLIRMemRefPassIncGen LINK_LIBS PUBLIC - MLIRAffineDialect MLIRAffineTransforms MLIRAffineUtils MLIRArithDialect MLIRArithTransforms - MLIRBufferizationDialect - MLIRBufferizationTransforms MLIRDialectUtils MLIRFuncDialect MLIRGPUDialect diff --git a/mlir/lib/Dialect/Mesh/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Mesh/Transforms/CMakeLists.txt index 212ea6d6948b2..d1d6261fb448d 100644 --- a/mlir/lib/Dialect/Mesh/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Mesh/Transforms/CMakeLists.txt @@ -23,8 +23,6 @@ add_mlir_dialect_library(MLIRMeshTransforms MLIRIR MLIRMeshDialect MLIRPass - MLIRShardingInterface MLIRSupport MLIRTensorDialect - MLIRTosaShardingInterfaceImpl ) diff --git a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt index 8c73515c608f5..fb877b5091492 100644 --- a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt @@ -29,8 +29,6 @@ 
add_mlir_dialect_library(MLIRSCFTransforms MLIRAffineDialect MLIRAffineAnalysis MLIRArithDialect - MLIRBufferizationDialect - MLIRBufferizationTransforms MLIRDestinationStyleOpInterface MLIRDialectUtils MLIRIR @@ -40,9 +38,7 @@ add_mlir_dialect_library(MLIRSCFTransforms MLIRSCFUtils MLIRSideEffectInterfaces MLIRSupport - MLIRTensorDialect MLIRTensorTransforms - MLIRTilingInterface MLIRTransforms MLIRTransformUtils ) diff --git a/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt index ce32dea09bb0b..a6152ecc23b53 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt @@ -20,12 +20,9 @@ add_mlir_dialect_library(MLIRTensorTransforms LINK_LIBS PUBLIC MLIRAffineDialect - MLIRAffineTransforms MLIRAffineUtils MLIRArithDialect MLIRArithUtils - MLIRBufferizationDialect - MLIRBufferizationTransforms MLIRDialectUtils MLIRIR MLIRLinalgDialect @@ -35,7 +32,6 @@ add_mlir_dialect_library(MLIRTensorTransforms MLIRSubsetOpInterface MLIRTensorDialect MLIRTensorUtils - MLIRTilingInterface MLIRTransforms MLIRVectorDialect MLIRVectorUtils diff --git a/mlir/lib/Dialect/Tosa/CMakeLists.txt b/mlir/lib/Dialect/Tosa/CMakeLists.txt index 1911405c63cd5..4b72309a70c06 100644 --- a/mlir/lib/Dialect/Tosa/CMakeLists.txt +++ b/mlir/lib/Dialect/Tosa/CMakeLists.txt @@ -19,7 +19,6 @@ add_mlir_dialect_library(MLIRTosaDialect MLIRDialect MLIRCallInterfaces MLIRControlFlowInterfaces - MLIRQuantDialect MLIRQuantUtils MLIRSideEffectInterfaces MLIRTensorDialect @@ -35,10 +34,8 @@ add_mlir_dialect_library(MLIRTosaShardingInterfaceImpl LINK_LIBS PUBLIC MLIRIR - MLIRMeshDialect MLIRShardingInterface MLIRSupport - MLIRTosaDialect ) add_subdirectory(Transforms) diff --git a/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt index 2639a67e1c8b3..b7e8724c3c258 100644 --- a/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt +++ 
b/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt @@ -35,8 +35,6 @@ add_mlir_dialect_library(MLIRVectorTransforms MLIRAffineAnalysis MLIRAffineUtils MLIRArithDialect - MLIRBufferizationDialect - MLIRBufferizationTransforms MLIRDialectUtils MLIRGPUDialect MLIRIR diff --git a/mlir/lib/Target/LLVM/CMakeLists.txt b/mlir/lib/Target/LLVM/CMakeLists.txt index bc14c568e46be..7e597b0f9bf94 100644 --- a/mlir/lib/Target/LLVM/CMakeLists.txt +++ b/mlir/lib/Target/LLVM/CMakeLists.txt @@ -125,7 +125,6 @@ add_mlir_dialect_library(MLIRROCDLTarget MLIRSupport MLIRGPUDialect MLIRTargetLLVM - MLIRROCDLToLLVMIRTranslation ) if(MLIR_ENABLE_ROCM_CONVERSIONS) diff --git a/mlir/lib/Target/LLVMIR/CMakeLists.txt b/mlir/lib/Target/LLVMIR/CMakeLists.txt index 93032c3ce1038..4cc83edb0e961 100644 --- a/mlir/lib/Target/LLVMIR/CMakeLists.txt +++ b/mlir/lib/Target/LLVMIR/CMakeLists.txt @@ -38,7 +38,6 @@ add_mlir_translation_library(MLIRTargetLLVMIRExport MLIRDLTIDialect MLIRLLVMDialect MLIRLLVMIRTransforms - MLIRTranslateLib MLIRTransformUtils ) @@ -79,7 +78,6 @@ add_mlir_translation_library(MLIRTargetLLVMIRImport LINK_LIBS PUBLIC MLIRDLTIDialect MLIRLLVMDialect - MLIRTranslateLib ) add_mlir_translation_library(MLIRFromLLVMIRTranslationRegistration diff --git a/mlir/lib/Target/SPIRV/CMakeLists.txt b/mlir/lib/Target/SPIRV/CMakeLists.txt index 22d6d195a249e..2926320ed286b 100644 --- a/mlir/lib/Target/SPIRV/CMakeLists.txt +++ b/mlir/lib/Target/SPIRV/CMakeLists.txt @@ -12,7 +12,6 @@ add_mlir_translation_library(MLIRSPIRVBinaryUtils LINK_LIBS PUBLIC MLIRIR - MLIRSPIRVDialect MLIRSupport ) @@ -21,11 +20,9 @@ add_mlir_translation_library(MLIRSPIRVTranslateRegistration LINK_LIBS PUBLIC MLIRIR - MLIRSPIRVDialect MLIRSPIRVSerialization MLIRSPIRVDeserialization MLIRSupport - MLIRTranslateLib ) add_mlir_dialect_library(MLIRSPIRVTarget @@ -34,7 +31,6 @@ add_mlir_dialect_library(MLIRSPIRVTarget LINK_LIBS PUBLIC MLIRIR MLIRGPUDialect - MLIRSPIRVDialect MLIRSPIRVSerialization MLIRSupport ) diff --git 
a/mlir/lib/Target/SPIRV/Serialization/CMakeLists.txt b/mlir/lib/Target/SPIRV/Serialization/CMakeLists.txt index 71580d8fcb978..036b97af4f9e7 100644 --- a/mlir/lib/Target/SPIRV/Serialization/CMakeLists.txt +++ b/mlir/lib/Target/SPIRV/Serialization/CMakeLists.txt @@ -8,10 +8,7 @@ add_mlir_translation_library(MLIRSPIRVSerialization LINK_LIBS PUBLIC MLIRIR - MLIRSPIRVDialect MLIRSPIRVBinaryUtils MLIRSupport MLIRTranslateLib ) - - diff --git a/mlir/lib/Transforms/CMakeLists.txt b/mlir/lib/Transforms/CMakeLists.txt index 90c0298fb5e46..058039e47313e 100644 --- a/mlir/lib/Transforms/CMakeLists.txt +++ b/mlir/lib/Transforms/CMakeLists.txt @@ -29,7 +29,6 @@ add_mlir_library(MLIRTransforms LINK_LIBS PUBLIC MLIRAnalysis - MLIRCopyOpInterface MLIRFunctionInterfaces MLIRLoopLikeInterface MLIRMemorySlotInterfaces diff --git a/mlir/unittests/Target/LLVM/CMakeLists.txt b/mlir/unittests/Target/LLVM/CMakeLists.txt index 6d612548a94c0..5d50e98f1a2a4 100644 --- a/mlir/unittests/Target/LLVM/CMakeLists.txt +++ b/mlir/unittests/Target/LLVM/CMakeLists.txt @@ -19,6 +19,7 @@ target_link_libraries(MLIRTargetLLVMTests MLIRNVVMToLLVMIRTranslation MLIRROCDLToLLVMIRTranslation MLIRGPUToLLVMIRTranslation + MLIRParser ${llvm_libs} ) From 96f37ae45310885e09195be09d9c05e1c1dff86b Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Mon, 30 Sep 2024 23:15:18 +0100 Subject: [PATCH 029/151] [NFC] Use initial-stack-allocations for more data structures (#110544) This replaces some of the most frequent offenders of using a DenseMap that cause a malloc, where the typical element-count is small enough to fit in an initial stack allocation. Most of these are fairly obvious, one to highlight is the collectOffset method of GEP instructions: if there's a GEP, of course it's going to have at least one offset, but every time we've called collectOffset we end up calling malloc as well for the DenseMap in the MapVector. 
--- llvm/include/llvm/IR/Instructions.h | 2 +- llvm/include/llvm/IR/Operator.h | 2 +- llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 3 ++- llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h | 4 ++-- llvm/lib/CodeGen/ScheduleDAGInstrs.cpp | 5 +++-- llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp | 2 +- llvm/lib/IR/Instructions.cpp | 2 +- llvm/lib/IR/Operator.cpp | 2 +- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 2 +- .../AggressiveInstCombine/AggressiveInstCombine.cpp | 2 +- llvm/lib/Transforms/IPO/AttributorAttributes.cpp | 2 +- llvm/lib/Transforms/Scalar/ConstraintElimination.cpp | 4 ++-- llvm/lib/Transforms/Scalar/GVN.cpp | 2 +- llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp | 2 +- llvm/lib/Transforms/Utils/Local.cpp | 4 ++-- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++-- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 10 ++++++---- 17 files changed, 29 insertions(+), 25 deletions(-) diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index 75a059760f48f..695a7a6aa9f25 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -1117,7 +1117,7 @@ class GetElementPtrInst : public Instruction { /// the base GEP pointer. bool accumulateConstantOffset(const DataLayout &DL, APInt &Offset) const; bool collectOffset(const DataLayout &DL, unsigned BitWidth, - MapVector &VariableOffsets, + SmallMapVector &VariableOffsets, APInt &ConstantOffset) const; // Methods for support type inquiry through isa, cast, and dyn_cast: static bool classof(const Instruction *I) { diff --git a/llvm/include/llvm/IR/Operator.h b/llvm/include/llvm/IR/Operator.h index 88b9bfc0be4b1..0e9f6ed35dcb4 100644 --- a/llvm/include/llvm/IR/Operator.h +++ b/llvm/include/llvm/IR/Operator.h @@ -528,7 +528,7 @@ class GEPOperator /// Collect the offset of this GEP as a map of Values to their associated /// APInt multipliers, as well as a total Constant Offset. 
bool collectOffset(const DataLayout &DL, unsigned BitWidth, - MapVector &VariableOffsets, + SmallMapVector &VariableOffsets, APInt &ConstantOffset) const; }; diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 6f211abb299e7..aa44d62da47be 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -2831,7 +2831,8 @@ static void emitRangeList( // Gather all the ranges that apply to the same section so they can share // a base address entry. - MapVector> SectionRanges; + SmallMapVector, 16> + SectionRanges; for (const auto &Range : R) SectionRanges[&Range.Begin->getSection()].push_back(&Range); diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h index f157ffc6bcc2d..68db65ace9a42 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h @@ -1046,7 +1046,7 @@ class VLocTracker { /// transfer function for this block, as part of the dataflow analysis. The /// movement of values between locations inside of a block is handled at a /// much later stage, in the TransferTracker class. - MapVector Vars; + SmallMapVector Vars; SmallDenseMap Scopes; MachineBasicBlock *MBB = nullptr; const OverlapMap &OverlappingFragments; @@ -1128,7 +1128,7 @@ class InstrRefBasedLDV : public LDVImpl { /// Live in/out structure for the variable values: a per-block map of /// variables to their values. 
- using LiveIdxT = DenseMap; + using LiveIdxT = SmallDenseMap; using VarAndLoc = std::pair; diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp index 68dece6cf73e9..a0632eb17e65e 100644 --- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -621,7 +621,8 @@ void ScheduleDAGInstrs::initSUnits() { } } -class ScheduleDAGInstrs::Value2SUsMap : public MapVector { +class ScheduleDAGInstrs::Value2SUsMap + : public SmallMapVector { /// Current total number of SUs in map. unsigned NumNodes = 0; @@ -656,7 +657,7 @@ class ScheduleDAGInstrs::Value2SUsMap : public MapVector { /// Clears map from all contents. void clear() { - MapVector::clear(); + SmallMapVector::clear(); NumNodes = 0; } diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index e4ee3fd99f16e..9e5867c70d7b6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -183,7 +183,7 @@ class ScheduleDAGRRList : public ScheduleDAGSDNodes { // Hack to keep track of the inverse of FindCallSeqStart without more crazy // DAG crawling. - DenseMap CallSeqEndForStart; + SmallDenseMap CallSeqEndForStart; public: ScheduleDAGRRList(MachineFunction &mf, bool needlatency, diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index e95b98a640443..009e0c03957c9 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -1584,7 +1584,7 @@ bool GetElementPtrInst::accumulateConstantOffset(const DataLayout &DL, bool GetElementPtrInst::collectOffset( const DataLayout &DL, unsigned BitWidth, - MapVector &VariableOffsets, + SmallMapVector &VariableOffsets, APInt &ConstantOffset) const { // Delegate to the generic GEPOperator implementation. 
return cast(this)->collectOffset(DL, BitWidth, VariableOffsets, diff --git a/llvm/lib/IR/Operator.cpp b/llvm/lib/IR/Operator.cpp index 6c9862556f550..f93ff8f6fc8a2 100644 --- a/llvm/lib/IR/Operator.cpp +++ b/llvm/lib/IR/Operator.cpp @@ -201,7 +201,7 @@ bool GEPOperator::accumulateConstantOffset( bool GEPOperator::collectOffset( const DataLayout &DL, unsigned BitWidth, - MapVector &VariableOffsets, + SmallMapVector &VariableOffsets, APInt &ConstantOffset) const { assert(BitWidth == DL.getIndexSizeInBits(getPointerAddressSpace()) && "The offset bit width does not match DL specification."); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 7bd618b2d9660..24bfbff41ec5c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -402,7 +402,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, // TODO: Extracting a "multiple of X" from a GEP might be a useful generic // helper. unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType()); - MapVector VarOffsets; + SmallMapVector VarOffsets; APInt ConstOffset(BW, 0); if (GEP->getPointerOperand()->stripPointerCasts() != Alloca || !GEP->collectOffset(DL, BW, VarOffsets, ConstOffset)) diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 01642b0677aba..9943c3cbb9fc7 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -843,7 +843,7 @@ getStrideAndModOffsetOfGEP(Value *PtrOp, const DataLayout &DL) { // Return a minimum gep stride, greatest common divisor of consective gep // index scales(c.f. Bézout's identity). 
while (auto *GEP = dyn_cast(PtrOp)) { - MapVector VarOffsets; + SmallMapVector VarOffsets; if (!GEP->collectOffset(DL, BW, VarOffsets, ModOffset)) break; diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 416dd09ca874b..238bdf9c344b0 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -1557,7 +1557,7 @@ bool AAPointerInfoFloating::collectConstantsForGEP(Attributor &A, const OffsetInfo &PtrOI, const GEPOperator *GEP) { unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType()); - MapVector VariableOffsets; + SmallMapVector VariableOffsets; APInt ConstantOffset(BitWidth, 0); assert(!UsrOI.isUnknown() && !PtrOI.isUnknown() && diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 7e2721d0c5a5e..7c06e0c757e1c 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -385,7 +385,7 @@ struct Decomposition { struct OffsetResult { Value *BasePtr; APInt ConstantOffset; - MapVector VariableOffsets; + SmallMapVector VariableOffsets; bool AllInbounds; OffsetResult() : BasePtr(nullptr), ConstantOffset(0, uint64_t(0)) {} @@ -410,7 +410,7 @@ static OffsetResult collectOffsets(GEPOperator &GEP, const DataLayout &DL) { // If we have a nested GEP, check if we can combine the constant offset of the // inner GEP with the outer GEP. 
if (auto *InnerGEP = dyn_cast(Result.BasePtr)) { - MapVector VariableOffsets2; + SmallMapVector VariableOffsets2; APInt ConstantOffset2(BitWidth, 0); bool CanCollectInner = InnerGEP->collectOffset( DL, BitWidth, VariableOffsets2, ConstantOffset2); diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index db39d8621d077..2ba600497e00d 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -422,7 +422,7 @@ GVNPass::Expression GVNPass::ValueTable::createGEPExpr(GetElementPtrInst *GEP) { Type *PtrTy = GEP->getType()->getScalarType(); const DataLayout &DL = GEP->getDataLayout(); unsigned BitWidth = DL.getIndexTypeSizeInBits(PtrTy); - MapVector VariableOffsets; + SmallMapVector VariableOffsets; APInt ConstantOffset(BitWidth, 0); if (GEP->collectOffset(DL, BitWidth, VariableOffsets, ConstantOffset)) { // Convert into offset representation, to recognize equivalent address diff --git a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp index 2a4f68e125252..7f99cd2060a9d 100644 --- a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp +++ b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp @@ -56,7 +56,7 @@ static std::optional parseJumpTable(GetElementPtrInst *GEP, const DataLayout &DL = F.getDataLayout(); const unsigned BitWidth = DL.getIndexSizeInBits(GEP->getPointerAddressSpace()); - MapVector VariableOffsets; + SmallMapVector VariableOffsets; APInt ConstantOffset(BitWidth, 0); if (!GEP->collectOffset(DL, BitWidth, VariableOffsets, ConstantOffset)) return std::nullopt; diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 7659fc6919615..cfe40f91f9a5d 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -925,7 +925,7 @@ CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ, } using PredBlockVector = SmallVector; -using IncomingValueMap = DenseMap; +using IncomingValueMap 
= SmallDenseMap; /// Determines the value to use as the phi node input for a block. /// @@ -2467,7 +2467,7 @@ Value *getSalvageOpsForGEP(GetElementPtrInst *GEP, const DataLayout &DL, SmallVectorImpl &AdditionalValues) { unsigned BitWidth = DL.getIndexSizeInBits(GEP->getPointerAddressSpace()); // Rewrite a GEP into a DIExpression. - MapVector VariableOffsets; + SmallMapVector VariableOffsets; APInt ConstantOffset(BitWidth, 0); if (!GEP->collectOffset(DL, BitWidth, VariableOffsets, ConstantOffset)) return nullptr; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 034765bee40e7..f5ef50934f59f 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5122,7 +5122,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { // Each 'key' in the map opens a new interval. The values // of the map are the index of the 'last seen' usage of the // instruction that is the key. - using IntervalMap = DenseMap; + using IntervalMap = SmallDenseMap; // Maps instruction to its index. SmallVector IdxToInstr; @@ -5165,7 +5165,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { // Saves the list of intervals that end with the index in 'key'. using InstrList = SmallVector; - DenseMap TransposeEnds; + SmallDenseMap TransposeEnds; // Transpose the EndPoints to a list of values that end at each index. 
for (auto &Interval : EndPoint) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 6b8ec55b30426..68bf5c52814f5 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5470,7 +5470,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { } return I1 < I2; }; - DenseMap PhiToId; + SmallDenseMap PhiToId; SmallVector Phis(TE.Scalars.size()); std::iota(Phis.begin(), Phis.end(), 0); OrdersType ResOrder(TE.Scalars.size()); @@ -10319,7 +10319,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); if (E->CombinedOp != TreeEntry::NotCombinedOp) ShuffleOrOp = E->CombinedOp; - SetVector UniqueValues(VL.begin(), VL.end()); + SmallSetVector UniqueValues(VL.begin(), VL.end()); const unsigned Sz = UniqueValues.size(); SmallBitVector UsedScalars(Sz, false); for (unsigned I = 0; I < Sz; ++I) { @@ -18013,7 +18013,7 @@ class HorizontalReduction { /// List of possibly reduced values. SmallVector> ReducedVals; /// Maps reduced value to the corresponding reduction operation. - DenseMap> ReducedValsToOps; + SmallDenseMap, 16> ReducedValsToOps; WeakTrackingVH ReductionRoot; /// The type of reduction operation. RecurKind RdxKind; @@ -18382,7 +18382,9 @@ class HorizontalReduction { // instruction op id and/or alternate op id, plus do extra analysis for // loads (grouping them by the distabce between pointers) and cmp // instructions (grouping them by the predicate). 
- MapVector>> + SmallMapVector< + size_t, SmallMapVector, 2>, + 8> PossibleReducedVals; initReductionOps(Root); DenseMap> LoadsMap; From c214af8454345a7986bce1395aad7f06b186352e Mon Sep 17 00:00:00 2001 From: vporpo Date: Mon, 30 Sep 2024 15:23:02 -0700 Subject: [PATCH 030/151] [SandboxVec][Interval] Implement intersection and difference operations (#110549) This patch implements a few set operations for the intervals. These include: - operator==() and operator!=() for comparing two intervals. - disjoint() - intersection() - difference, which uses operator-() --- .../Vectorize/SandboxVectorizer/Interval.h | 58 +++++++ .../SandboxVectorizer/IntervalTest.cpp | 158 ++++++++++++++++++ 2 files changed, 216 insertions(+) diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Interval.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Interval.h index d088c6c556f3a..8f25ad109f6a6 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Interval.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Interval.h @@ -118,6 +118,64 @@ template class Interval { const_iterator end() const { return const_iterator(To != nullptr ? To->getNextNode() : nullptr, *this); } + /// Equality. + bool operator==(const Interval &Other) const { + return From == Other.From && To == Other.To; + } + /// Inequality. + bool operator!=(const Interval &Other) const { return !(*this == Other); } + /// \Returns true if this and \p Other have nothing in common. + bool disjoint(const Interval &Other) const { + if (Other.empty()) + return true; + if (empty()) + return true; + return Other.To->comesBefore(From) || To->comesBefore(Other.From); + } + /// \Returns the intersection between this and \p Other. + // Example: + // |----| this + // |---| Other + // |-| this->getIntersection(Other) + Interval intersection(const Interval &Other) const { + if (empty()) + return *this; + if (Other.empty()) + return Interval(); + // 1. 
No overlap + // A---B this + // C--D Other + if (To->comesBefore(Other.From) || Other.To->comesBefore(From)) + return Interval(); + // 2. Overlap. + // A---B this + // C--D Other + auto NewFromI = From->comesBefore(Other.From) ? Other.From : From; + auto NewToI = To->comesBefore(Other.To) ? To : Other.To; + return Interval(NewFromI, NewToI); + } + /// Difference operation. This returns up to two intervals. + // Example: + // |--------| this + // |-| Other + // |-| |--| this - Other + SmallVector operator-(const Interval &Other) { + if (disjoint(Other)) + return {*this}; + if (Other.empty()) + return {*this}; + if (*this == Other) + return {Interval()}; + Interval Intersection = intersection(Other); + SmallVector Result; + // Part 1, skip if empty. + if (From != Intersection.From) + Result.emplace_back(From, Intersection.From->getPrevNode()); + // Part 2, skip if empty. + if (Intersection.To != To) + Result.emplace_back(Intersection.To->getNextNode(), To); + return Result; + } }; } // namespace llvm::sandboxir diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/IntervalTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/IntervalTest.cpp index d463a61d5969b..a697ce7727a9b 100644 --- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/IntervalTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/IntervalTest.cpp @@ -12,6 +12,7 @@ #include "llvm/SandboxIR/Function.h" #include "llvm/SandboxIR/Instruction.h" #include "llvm/Support/SourceMgr.h" +#include "gmock/gmock-matchers.h" #include "gtest/gtest.h" using namespace llvm; @@ -90,4 +91,161 @@ define void @foo(i8 %v0) { auto BBIt = BB->begin(); for (auto &I : Intvl) EXPECT_EQ(&I, &*BBIt++); + { + // Check equality. + EXPECT_TRUE(Empty == Empty); + EXPECT_FALSE(Empty == One); + EXPECT_TRUE(One == One); + sandboxir::Interval Intvl1(I0, I2); + sandboxir::Interval Intvl2(I0, I2); + EXPECT_TRUE(Intvl1 == Intvl1); + EXPECT_TRUE(Intvl1 == Intvl2); + } + { + // Check inequality. 
+ EXPECT_FALSE(Empty != Empty); + EXPECT_TRUE(Empty != One); + EXPECT_FALSE(One != One); + sandboxir::Interval Intvl1(I0, I2); + sandboxir::Interval Intvl2(I0, I2); + EXPECT_FALSE(Intvl1 != Intvl1); + EXPECT_FALSE(Intvl1 != Intvl2); + } + { + // Check disjoint(). + EXPECT_TRUE(Empty.disjoint(Empty)); + EXPECT_TRUE(One.disjoint(Empty)); + EXPECT_TRUE(Empty.disjoint(One)); + sandboxir::Interval Intvl1(I0, I2); + sandboxir::Interval Intvl2(I1, Ret); + EXPECT_FALSE(Intvl1.disjoint(Intvl2)); + sandboxir::Interval Intvl3(I2, I2); + EXPECT_FALSE(Intvl1.disjoint(Intvl3)); + EXPECT_TRUE(Intvl1.disjoint(Empty)); + } +} + +// Helper function for returning a vector of instruction pointers from a range +// of references. +template +static SmallVector getPtrVec(RangeT Range) { + SmallVector PtrVec; + for (sandboxir::Instruction &I : Range) + PtrVec.push_back(&I); + return PtrVec; +} + +TEST_F(IntervalTest, Difference) { + parseIR(C, R"IR( +define void @foo(i8 %v0) { + %I0 = add i8 %v0, %v0 + %I1 = add i8 %v0, %v0 + %I2 = add i8 %v0, %v0 + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto &F = *Ctx.createFunction(&LLVMF); + auto *BB = &*F.begin(); + auto It = BB->begin(); + auto *I0 = &*It++; + auto *I1 = &*It++; + auto *I2 = &*It++; + auto *Ret = &*It++; + + { + // Check [I0,Ret] - [] + sandboxir::Interval I0Ret(I0, Ret); + sandboxir::Interval Empty; + auto Diffs = I0Ret - Empty; + EXPECT_EQ(Diffs.size(), 1u); + const sandboxir::Interval &Diff = Diffs[0]; + EXPECT_THAT(getPtrVec(Diff), testing::ElementsAre(I0, I1, I2, Ret)); + } + { + // Check [] - [I0,Ret] + sandboxir::Interval Empty; + sandboxir::Interval I0Ret(I0, Ret); + auto Diffs = Empty - I0Ret; + EXPECT_EQ(Diffs.size(), 1u); + const sandboxir::Interval &Diff = Diffs[0]; + EXPECT_TRUE(Diff.empty()); + } + { + // Check [I0,Ret] - [I0]. 
+ sandboxir::Interval I0Ret(I0, Ret); + sandboxir::Interval I0I0(I0, I0); + auto Diffs = I0Ret - I0I0; + EXPECT_EQ(Diffs.size(), 1u); + const sandboxir::Interval &Diff = Diffs[0]; + EXPECT_THAT(getPtrVec(Diff), testing::ElementsAre(I1, I2, Ret)); + } + { + // Check [I0,Ret] - [I1]. + sandboxir::Interval I0Ret(I0, Ret); + sandboxir::Interval I1I1(I1, I1); + auto Diffs = I0Ret - I1I1; + EXPECT_EQ(Diffs.size(), 2u); + const sandboxir::Interval &Diff0 = Diffs[0]; + EXPECT_THAT(getPtrVec(Diff0), testing::ElementsAre(I0)); + const sandboxir::Interval &Diff1 = Diffs[1]; + EXPECT_THAT(getPtrVec(Diff1), testing::ElementsAre(I2, Ret)); + } +} + +TEST_F(IntervalTest, Intersection) { + parseIR(C, R"IR( +define void @foo(i8 %v0) { + %I0 = add i8 %v0, %v0 + %I1 = add i8 %v0, %v0 + %I2 = add i8 %v0, %v0 + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto &F = *Ctx.createFunction(&LLVMF); + auto *BB = &*F.begin(); + auto It = BB->begin(); + auto *I0 = &*It++; + auto *I1 = &*It++; + [[maybe_unused]] auto *I2 = &*It++; + auto *Ret = &*It++; + + { + // Check [I0,Ret] ^ [] + sandboxir::Interval I0Ret(I0, Ret); + sandboxir::Interval Empty; + auto Intersection = I0Ret.intersection(Empty); + EXPECT_TRUE(Intersection.empty()); + } + { + // Check [] ^ [I0,Ret] + sandboxir::Interval Empty; + sandboxir::Interval I0Ret(I0, Ret); + auto Intersection = Empty.intersection(I0Ret); + EXPECT_TRUE(Intersection.empty()); + } + { + // Check [I0,Ret] ^ [I0] + sandboxir::Interval I0Ret(I0, Ret); + sandboxir::Interval I0I0(I0, I0); + auto Intersection = I0Ret.intersection(I0I0); + EXPECT_THAT(getPtrVec(Intersection), testing::ElementsAre(I0)); + } + { + // Check [I0] ^ [I0,Ret] + sandboxir::Interval I0I0(I0, I0); + sandboxir::Interval I0Ret(I0, Ret); + auto Intersection = I0I0.intersection(I0Ret); + EXPECT_THAT(getPtrVec(Intersection), testing::ElementsAre(I0)); + } + { + // Check [I0,Ret] ^ [I1]. 
+ sandboxir::Interval I0Ret(I0, Ret); + sandboxir::Interval I1I1(I1, I1); + auto Intersection = I0Ret.intersection(I1I1); + EXPECT_THAT(getPtrVec(Intersection), testing::ElementsAre(I1)); + } } From 27a8f00b2257c4dca40ca71a972970ae2fac308c Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Tue, 1 Oct 2024 07:17:57 +0900 Subject: [PATCH 031/151] [Bazel] Fixup for #110538, Rename SandboxIRValues.def to Values.def --- utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index eb87b6f7cef54..727b19e47ec82 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -1455,7 +1455,7 @@ cc_library( ]), hdrs = glob(["include/llvm/SandboxIR/*.h"]), copts = llvm_copts, - textual_hdrs = ["include/llvm/SandboxIR/SandboxIRValues.def"], + textual_hdrs = ["include/llvm/SandboxIR/Values.def"], deps = [ ":Analysis", ":Core", From 7b2346829f434c6411fff6ccdbb063758532f77e Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Mon, 30 Sep 2024 15:37:44 -0700 Subject: [PATCH 032/151] [WebAssembly] Use 'any' type in more cases in AsmTypeCheck (#110403) Now that we support 'any' type in the value stack in the checker, this uses it in more places. When an instruction pops multiple values, rather than popping in one by one and generating multiple error messages, it adds them to a vector and pops them at once. When the type to be popped is not clear, it pops 'any', at least makes sure there are correct number of values on the stack. So for example, in case of `table.fill`, which expects `[i32 t i32]` (where t is the type of the elements in the table), it pops them at once, generating an error message like ```console error: type mismatch, expected [i32, externref, i32] but got [...] 
``` In case the table is invalid so we don't know the type, it tries to pop an 'any' instead, popping whatever value there is: ```console error: type mismatch, expected [i32, any, i32] but got [...] ``` Checks done on other instructions based on the register info are already popping and pushing types in vectors, after #110094: https://github.com/llvm/llvm-project/blob/a52251675f001115b225f57362d37e92b7355ef9/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp#L515-L536 This also pushes 'any' in case the type to push is unclear. For example, `local/global.set` pushes a value of the type specified in the local or global, but in case that local or global is invalid, we push 'any' instead, which will match with whatever type. The objective of all these is not to make one instruction's error propragate continuously into subsequent instructions. This also matches Wabt's behavior. This also renames `checkAndPopTypes` to just `popTypes`, to be consistent with a single-element version `popType`. `popType(s)` also does type checks. 
--- .../AsmParser/WebAssemblyAsmTypeCheck.cpp | 85 ++++++++++++------- .../AsmParser/WebAssemblyAsmTypeCheck.h | 16 ++-- .../test/MC/WebAssembly/type-checker-errors.s | 60 ++++++++----- 3 files changed, 103 insertions(+), 58 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp index 845bf3976c22b..6c71460201537 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp @@ -88,21 +88,21 @@ bool WebAssemblyAsmTypeCheck::match(StackType TypeA, StackType TypeB) { std::string WebAssemblyAsmTypeCheck::getTypesString(ArrayRef Types, size_t StartPos) { - SmallVector Reverse; + SmallVector TypeStrs; for (auto I = Types.size(); I > StartPos; I--) { if (std::get_if(&Types[I - 1])) - Reverse.push_back("any"); + TypeStrs.push_back("any"); else if (std::get_if(&Types[I - 1])) - Reverse.push_back("ref"); + TypeStrs.push_back("ref"); else - Reverse.push_back( + TypeStrs.push_back( WebAssembly::typeToString(std::get(Types[I - 1]))); } std::stringstream SS; SS << "["; bool First = true; - for (auto It = Reverse.rbegin(); It != Reverse.rend(); ++It) { + for (auto It = TypeStrs.rbegin(); It != TypeStrs.rend(); ++It) { if (!First) SS << ", "; SS << *It; @@ -159,15 +159,15 @@ bool WebAssemblyAsmTypeCheck::checkTypes(SMLoc ErrorLoc, getTypesString(Stack, StackStartPos)); } -bool WebAssemblyAsmTypeCheck::checkAndPopTypes(SMLoc ErrorLoc, - ArrayRef ValTypes, - bool ExactMatch) { - return checkAndPopTypes(ErrorLoc, valTypeToStackType(ValTypes), ExactMatch); +bool WebAssemblyAsmTypeCheck::popTypes(SMLoc ErrorLoc, + ArrayRef ValTypes, + bool ExactMatch) { + return popTypes(ErrorLoc, valTypeToStackType(ValTypes), ExactMatch); } -bool WebAssemblyAsmTypeCheck::checkAndPopTypes(SMLoc ErrorLoc, - ArrayRef Types, - bool ExactMatch) { +bool WebAssemblyAsmTypeCheck::popTypes(SMLoc ErrorLoc, + ArrayRef 
Types, + bool ExactMatch) { bool Error = checkTypes(ErrorLoc, Types, ExactMatch); auto NumPops = std::min(Stack.size(), Types.size()); for (size_t I = 0, E = NumPops; I != E; I++) @@ -176,7 +176,7 @@ bool WebAssemblyAsmTypeCheck::checkAndPopTypes(SMLoc ErrorLoc, } bool WebAssemblyAsmTypeCheck::popType(SMLoc ErrorLoc, StackType Type) { - return checkAndPopTypes(ErrorLoc, {Type}, false); + return popTypes(ErrorLoc, {Type}); } bool WebAssemblyAsmTypeCheck::popRefType(SMLoc ErrorLoc) { @@ -207,7 +207,7 @@ bool WebAssemblyAsmTypeCheck::checkBr(SMLoc ErrorLoc, size_t Level) { StringRef("br: invalid depth ") + std::to_string(Level)); const SmallVector &Expected = BrStack[BrStack.size() - Level - 1]; - return checkTypes(ErrorLoc, Expected, false); + return checkTypes(ErrorLoc, Expected); return false; } @@ -216,13 +216,13 @@ bool WebAssemblyAsmTypeCheck::checkEnd(SMLoc ErrorLoc, bool PopVals) { BrStack.pop_back(); if (PopVals) - return checkAndPopTypes(ErrorLoc, LastSig.Returns, false); - return checkTypes(ErrorLoc, LastSig.Returns, false); + return popTypes(ErrorLoc, LastSig.Returns); + return checkTypes(ErrorLoc, LastSig.Returns); } bool WebAssemblyAsmTypeCheck::checkSig(SMLoc ErrorLoc, const wasm::WasmSignature &Sig) { - bool Error = checkAndPopTypes(ErrorLoc, Sig.Params, false); + bool Error = popTypes(ErrorLoc, Sig.Params); pushTypes(Sig.Returns); return Error; } @@ -309,7 +309,7 @@ bool WebAssemblyAsmTypeCheck::getSignature(SMLoc ErrorLoc, } bool WebAssemblyAsmTypeCheck::endOfFunction(SMLoc ErrorLoc, bool ExactMatch) { - bool Error = checkAndPopTypes(ErrorLoc, ReturnTypes, ExactMatch); + bool Error = popTypes(ErrorLoc, ReturnTypes, ExactMatch); Unreachable = true; return Error; } @@ -326,12 +326,14 @@ bool WebAssemblyAsmTypeCheck::typeCheck(SMLoc ErrorLoc, const MCInst &Inst, pushType(Type); return false; } + pushType(Any{}); return true; } if (Name == "local.set") { if (!getLocal(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return popType(ErrorLoc, Type); 
+ popType(ErrorLoc, Any{}); return true; } @@ -341,6 +343,8 @@ bool WebAssemblyAsmTypeCheck::typeCheck(SMLoc ErrorLoc, const MCInst &Inst, pushType(Type); return Error; } + popType(ErrorLoc, Any{}); + pushType(Any{}); return true; } @@ -349,12 +353,14 @@ bool WebAssemblyAsmTypeCheck::typeCheck(SMLoc ErrorLoc, const MCInst &Inst, pushType(Type); return false; } + pushType(Any{}); return true; } if (Name == "global.set") { if (!getGlobal(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return popType(ErrorLoc, Type); + popType(ErrorLoc, Any{}); return true; } @@ -364,16 +370,21 @@ bool WebAssemblyAsmTypeCheck::typeCheck(SMLoc ErrorLoc, const MCInst &Inst, pushType(Type); return Error; } + pushType(Any{}); return true; } if (Name == "table.set") { bool Error = false; - if (!getTable(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) - Error |= popType(ErrorLoc, Type); - else + SmallVector PopTypes; + PopTypes.push_back(wasm::ValType::I32); + if (!getTable(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) { + PopTypes.push_back(Type); + } else { Error = true; - Error |= popType(ErrorLoc, wasm::ValType::I32); + PopTypes.push_back(Any{}); + } + Error |= popTypes(ErrorLoc, PopTypes); return Error; } @@ -384,22 +395,32 @@ bool WebAssemblyAsmTypeCheck::typeCheck(SMLoc ErrorLoc, const MCInst &Inst, } if (Name == "table.grow") { - bool Error = popType(ErrorLoc, wasm::ValType::I32); - if (!getTable(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) - Error |= popType(ErrorLoc, Type); - else + bool Error = false; + SmallVector PopTypes; + if (!getTable(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) { + PopTypes.push_back(Type); + } else { Error = true; + PopTypes.push_back(Any{}); + } + PopTypes.push_back(wasm::ValType::I32); + Error |= popTypes(ErrorLoc, PopTypes); pushType(wasm::ValType::I32); return Error; } if (Name == "table.fill") { - bool Error = popType(ErrorLoc, wasm::ValType::I32); - if (!getTable(Operands[1]->getStartLoc(), 
Inst.getOperand(0), Type)) - Error |= popType(ErrorLoc, Type); - else + bool Error = false; + SmallVector PopTypes; + PopTypes.push_back(wasm::ValType::I32); + if (!getTable(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) { + PopTypes.push_back(Type); + } else { Error = true; - Error |= popType(ErrorLoc, wasm::ValType::I32); + PopTypes.push_back(Any{}); + } + PopTypes.push_back(wasm::ValType::I32); + Error |= popTypes(ErrorLoc, PopTypes); return Error; } @@ -525,7 +546,7 @@ bool WebAssemblyAsmTypeCheck::typeCheck(SMLoc ErrorLoc, const MCInst &Inst, if (Op.OperandType == MCOI::OPERAND_REGISTER) PopTypes.push_back(WebAssembly::regClassToValType(Op.RegClass)); } - bool Error = checkAndPopTypes(ErrorLoc, PopTypes, false); + bool Error = popTypes(ErrorLoc, PopTypes); SmallVector PushTypes; // Now push all the defs onto the stack. for (unsigned I = 0; I < II.getNumDefs(); I++) { diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h index 9fd35a26f30e5..df063d749e3b4 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h @@ -40,17 +40,21 @@ class WebAssemblyAsmTypeCheck final { bool Unreachable = false; bool Is64; + // checkTypes checks 'Types' against the value stack. popTypes checks 'Types' + // against the value stack and also pops them. 
+ // // If ExactMatch is true, 'Types' will be compared against not only the top of // the value stack but the whole remaining value stack // (TODO: This should be the whole remaining value stack "at the the current // block level", which has not been implemented yet) bool checkTypes(SMLoc ErrorLoc, ArrayRef Types, - bool ExactMatch); - bool checkTypes(SMLoc ErrorLoc, ArrayRef Types, bool ExactMatch); - bool checkAndPopTypes(SMLoc ErrorLoc, ArrayRef Types, - bool ExactMatch); - bool checkAndPopTypes(SMLoc ErrorLoc, ArrayRef Types, - bool ExactMatch); + bool ExactMatch = false); + bool checkTypes(SMLoc ErrorLoc, ArrayRef Types, + bool ExactMatch = false); + bool popTypes(SMLoc ErrorLoc, ArrayRef Types, + bool ExactMatch = false); + bool popTypes(SMLoc ErrorLoc, ArrayRef Types, + bool ExactMatch = false); bool popType(SMLoc ErrorLoc, StackType Type); bool popRefType(SMLoc ErrorLoc); bool popAnyType(SMLoc ErrorLoc); diff --git a/llvm/test/MC/WebAssembly/type-checker-errors.s b/llvm/test/MC/WebAssembly/type-checker-errors.s index 5fdc2f56daf57..d81c5aff0a7e9 100644 --- a/llvm/test/MC/WebAssembly/type-checker-errors.s +++ b/llvm/test/MC/WebAssembly/type-checker-errors.s @@ -139,15 +139,14 @@ table_set_missing_tabletype: table_set_empty_stack_while_popping_1: .functype table_set_empty_stack_while_popping_1 () -> () -# CHECK: :[[@LINE+2]]:3: error: type mismatch, expected [externref] but got [] -# CHECK: :[[@LINE+1]]:3: error: type mismatch, expected [i32] but got [] +# CHECK: :[[@LINE+1]]:3: error: type mismatch, expected [i32, externref] but got [] table.set valid_table end_function table_set_empty_stack_while_popping_2: .functype table_set_empty_stack_while_popping_2 (externref) -> () local.get 0 -# CHECK: :[[@LINE+1]]:3: error: type mismatch, expected [i32] but got [] +# CHECK: :[[@LINE+1]]:3: error: type mismatch, expected [i32, externref] but got [externref] table.set valid_table end_function @@ -155,7 +154,7 @@ table_set_type_mismatch_1: .functype 
table_set_type_mismatch_1 () -> () i32.const 0 ref.null_func -# CHECK: :[[@LINE+1]]:3: error: type mismatch, expected [externref] but got [funcref] +# CHECK: :[[@LINE+1]]:3: error: type mismatch, expected [i32, externref] but got [i32, funcref] table.set valid_table end_function @@ -163,7 +162,7 @@ table_set_type_mismatch_2: .functype table_set_type_mismatch_2 () -> () f32.const 1.0 ref.null_extern -# CHECK: :[[@LINE+1]]:3: error: type mismatch, expected [i32] but got [f32] +# CHECK: :[[@LINE+1]]:3: error: type mismatch, expected [i32, externref] but got [f32, externref] table.set valid_table end_function @@ -187,17 +186,14 @@ table_fill_missing_tabletype: table_fill_empty_stack_while_popping_1: .functype table_fill_empty_stack_while_popping_1 () -> () -# CHECK: :[[@LINE+3]]:3: error: type mismatch, expected [i32] but got [] -# CHECK: :[[@LINE+2]]:3: error: type mismatch, expected [externref] but got [] -# CHECK: :[[@LINE+1]]:3: error: type mismatch, expected [i32] but got [] +# CHECK: :[[@LINE+1]]:3: error: type mismatch, expected [i32, externref, i32] but got [] table.fill valid_table end_function table_fill_empty_stack_while_popping_2: .functype table_fill_empty_stack_while_popping_2 (i32) -> () local.get 0 -# CHECK: :[[@LINE+2]]:3: error: type mismatch, expected [externref] but got [] -# CHECK: :[[@LINE+1]]:3: error: type mismatch, expected [i32] but got [] +# CHECK: :[[@LINE+1]]:3: error: type mismatch, expected [i32, externref, i32] but got [i32] table.fill valid_table end_function @@ -205,7 +201,7 @@ table_fill_empty_stack_while_popping_3: .functype table_fill_empty_stack_while_popping_3 (i32, externref) -> () local.get 1 local.get 0 -# CHECK: :[[@LINE+1]]:3: error: type mismatch, expected [i32] but got [] +# CHECK: :[[@LINE+1]]:3: error: type mismatch, expected [i32, externref, i32] but got [externref, i32] table.fill valid_table end_function @@ -214,7 +210,7 @@ table_fill_type_mismatch_1: i32.const 0 ref.null_extern ref.null_func -# CHECK: :[[@LINE+1]]:3: 
error: type mismatch, expected [i32] but got [funcref] +# CHECK: :[[@LINE+1]]:3: error: type mismatch, expected [i32, externref, i32] but got [i32, externref, funcref] table.fill valid_table end_function @@ -223,7 +219,7 @@ table_fill_type_mismatch_2: i32.const 0 ref.null_func i32.const 1 -# CHECK: [[@LINE+1]]:3: error: type mismatch, expected [externref] but got [funcref] +# CHECK: [[@LINE+1]]:3: error: type mismatch, expected [i32, externref, i32] but got [i32, funcref, i32] table.fill valid_table end_function @@ -232,7 +228,7 @@ table_fill_type_mismatch_3: f32.const 2.0 ref.null_extern i32.const 1 -# CHECK: :[[@LINE+1]]:3: error: type mismatch, expected [i32] but got [f32] +# CHECK: :[[@LINE+1]]:3: error: type mismatch, expected [i32, externref, i32] but got [f32, externref, i32] table.fill valid_table end_function @@ -241,7 +237,7 @@ table_fill_type_mismatch_4: i32.const 1 ref.null_exn i32.const 1 -# CHECK: [[@LINE+1]]:3: error: type mismatch, expected [externref] but got [exnref] +# CHECK: [[@LINE+1]]:3: error: type mismatch, expected [i32, externref, i32] but got [i32, exnref, i32] table.fill valid_table end_function @@ -256,7 +252,7 @@ table_grow_non_exist_table: table_grow_type_mismatch_1: .functype table_grow_type_mismatch_1 (externref, i32) -> (i32) local.get 1 -# CHECK: [[@LINE+1]]:3: error: type mismatch, expected [externref] but got [] +# CHECK: [[@LINE+1]]:3: error: type mismatch, expected [externref, i32] but got [i32] table.grow valid_table end_function @@ -264,7 +260,7 @@ table_grow_type_mismatch_2: .functype table_grow_type_mismatch_2 (externref, i32) -> (i32) local.get 0 local.get 0 -# CHECK: [[@LINE+1]]:3: error: type mismatch, expected [i32] but got [externref] +# CHECK: [[@LINE+1]]:3: error: type mismatch, expected [externref, i32] but got [externref, externref] table.grow valid_table end_function @@ -883,9 +879,7 @@ multiple_errors_in_function: # CHECK: :[[@LINE+1]]:13: error: expected expression operand table.get 1 -# CHECK: :[[@LINE+3]]:3: 
error: type mismatch, expected [i32] but got [] -# CHECK: :[[@LINE+2]]:3: error: type mismatch, expected [externref] but got [] -# CHECK: :[[@LINE+1]]:3: error: type mismatch, expected [i32] but got [] +# CHECK: :[[@LINE+1]]:3: error: type mismatch, expected [i32, externref, i32] but got [any] table.fill valid_table f32.const 0.0 @@ -905,3 +899,29 @@ call_with_multi_param_and_return: call take_and_return_multi # CHECK: :[[@LINE+1]]:3: error: type mismatch, expected [i32] but got [i32, i64, f32, f64] end_function + +.functype callee (f32, i32) -> () + +any_value_on_stack: + .functype any_value_on_stack () -> () + # This local does not exist so it should error out, but it should put an 'any' + # value on the stack so 'call callee' should not error out again +# CHECK: :[[@LINE+1]]:13: error: no local type specified for index 0 + local.get 0 + i32.const 0 +# CHECK-NOT: :[[@LINE+1]]:3: error: type mismatch + call callee + + # But this time 'call callee' should error out + i32.const 0 +# CHECK: :[[@LINE+1]]:13: error: no local type specified for index 0 + local.get 0 +# CHECK: :[[@LINE+1]]:3: error: type mismatch, expected [f32, i32] but got [i32, any] + call callee + +# CHECK: :[[@LINE+2]]:13: error: no local type specified for index 0 +# CHECK: :[[@LINE+1]]:3: error: type mismatch, expected [any] but got [] + local.set 0 + drop + + end_function From 8b47711e844bce7d2b02022a0e190b9dcd3e50c4 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Tue, 1 Oct 2024 00:44:21 +0200 Subject: [PATCH 033/151] Revert "CMake: Remove unnecessary dependencies on LLVM/MLIR" (#110594) Reverts llvm/llvm-project#110362 Multiple bots are broken. 
--- llvm/lib/MC/CMakeLists.txt | 1 + mlir/lib/Conversion/ConvertToLLVM/CMakeLists.txt | 1 + mlir/lib/Conversion/VectorToLLVM/CMakeLists.txt | 2 ++ mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt | 1 + mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt | 2 ++ mlir/lib/Dialect/Func/Transforms/CMakeLists.txt | 2 ++ mlir/lib/Dialect/GPU/CMakeLists.txt | 2 ++ mlir/lib/Dialect/Linalg/IR/CMakeLists.txt | 2 ++ mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt | 8 ++++++++ mlir/lib/Dialect/MemRef/Transforms/CMakeLists.txt | 3 +++ mlir/lib/Dialect/Mesh/Transforms/CMakeLists.txt | 2 ++ mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt | 4 ++++ mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt | 4 ++++ mlir/lib/Dialect/Tosa/CMakeLists.txt | 3 +++ mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt | 2 ++ mlir/lib/Target/LLVM/CMakeLists.txt | 1 + mlir/lib/Target/LLVMIR/CMakeLists.txt | 2 ++ mlir/lib/Target/SPIRV/CMakeLists.txt | 4 ++++ mlir/lib/Target/SPIRV/Serialization/CMakeLists.txt | 3 +++ mlir/lib/Transforms/CMakeLists.txt | 1 + mlir/unittests/Target/LLVM/CMakeLists.txt | 1 - 21 files changed, 50 insertions(+), 1 deletion(-) diff --git a/llvm/lib/MC/CMakeLists.txt b/llvm/lib/MC/CMakeLists.txt index e1d19196c8766..a089d2bff94f4 100644 --- a/llvm/lib/MC/CMakeLists.txt +++ b/llvm/lib/MC/CMakeLists.txt @@ -79,6 +79,7 @@ add_llvm_component_library(LLVMMC Support TargetParser BinaryFormat + DebugInfoCodeView DEPENDS intrinsics_gen diff --git a/mlir/lib/Conversion/ConvertToLLVM/CMakeLists.txt b/mlir/lib/Conversion/ConvertToLLVM/CMakeLists.txt index de3d850d520c0..df7e3f995303c 100644 --- a/mlir/lib/Conversion/ConvertToLLVM/CMakeLists.txt +++ b/mlir/lib/Conversion/ConvertToLLVM/CMakeLists.txt @@ -20,6 +20,7 @@ add_mlir_conversion_library(MLIRConvertToLLVMPass MLIRConversionPassIncGen LINK_LIBS PUBLIC + MLIRConvertToLLVMInterface MLIRIR MLIRLLVMCommonConversion MLIRLLVMDialect diff --git a/mlir/lib/Conversion/VectorToLLVM/CMakeLists.txt b/mlir/lib/Conversion/VectorToLLVM/CMakeLists.txt 
index 27b5bf7eaa89f..35576732c82cf 100644 --- a/mlir/lib/Conversion/VectorToLLVM/CMakeLists.txt +++ b/mlir/lib/Conversion/VectorToLLVM/CMakeLists.txt @@ -34,6 +34,8 @@ add_mlir_conversion_library(MLIRVectorToLLVMPass LINK_LIBS PUBLIC MLIRVectorToLLVM + MLIRArmNeonDialect + MLIRArmSMEDialect MLIRArmSVEDialect MLIRArmSVETransforms MLIRAMXDialect diff --git a/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt index 607d4557e2f6a..772f15335d907 100644 --- a/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt @@ -32,6 +32,7 @@ add_mlir_dialect_library(MLIRAffineTransforms MLIRIR MLIRMemRefDialect MLIRPass + MLIRSCFUtils MLIRSideEffectInterfaces MLIRTensorDialect MLIRTransformUtils diff --git a/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt index 93a004d31916f..6b8bde8dc2aaf 100644 --- a/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt @@ -20,6 +20,8 @@ add_mlir_dialect_library(MLIRArithTransforms LINK_LIBS PUBLIC MLIRAnalysis MLIRArithDialect + MLIRBufferizationDialect + MLIRBufferizationTransforms MLIRFuncDialect MLIRFuncTransforms MLIRInferIntRangeInterface diff --git a/mlir/lib/Dialect/Func/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Func/Transforms/CMakeLists.txt index 6ba7aaaae903f..172019907c3a8 100644 --- a/mlir/lib/Dialect/Func/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Func/Transforms/CMakeLists.txt @@ -12,6 +12,8 @@ add_mlir_dialect_library(MLIRFuncTransforms MLIRFuncTransformsIncGen LINK_LIBS PUBLIC + MLIRBufferizationDialect + MLIRBufferizationTransforms MLIRFuncDialect MLIRIR MLIRMemRefDialect diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt index a8a961282993a..a59645480aba2 100644 --- a/mlir/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/CMakeLists.txt @@ -67,7 +67,9 @@ 
add_mlir_dialect_library(MLIRGPUTransforms MLIRPass MLIRSCFDialect MLIRSideEffectInterfaces + MLIRSPIRVTarget MLIRSupport + MLIRROCDLTarget MLIRTransformUtils MLIRVectorDialect ) diff --git a/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt b/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt index f1fcb22fb8d54..c187563b8f0c4 100644 --- a/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt @@ -25,12 +25,14 @@ add_mlir_dialect_library(MLIRLinalgDialect MLIRInferTypeOpInterface MLIRIR MLIRParser + MLIRShardingInterface MLIRSideEffectInterfaces MLIRSparseTensorDialect MLIRSCFDialect MLIRMathDialect MLIRMemRefDialect MLIRTensorDialect + MLIRTilingInterface MLIRValueBoundsOpInterface MLIRViewLikeInterface ) diff --git a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt index 06bac0d470d42..47af392def94a 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt @@ -48,11 +48,14 @@ add_mlir_dialect_library(MLIRLinalgTransforms LINK_LIBS PUBLIC MLIRAffineDialect + MLIRAffineTransforms MLIRAffineUtils MLIRAnalysis MLIRArithDialect MLIRArithTransforms MLIRArithUtils + MLIRBufferizationDialect + MLIRBufferizationTransforms MLIRComplexDialect MLIRDestinationStyleOpInterface MLIRDialectUtils @@ -63,15 +66,20 @@ add_mlir_dialect_library(MLIRLinalgTransforms MLIRIR MLIRMemRefDialect MLIRMemRefTransforms + MLIRMeshDialect MLIRMeshTransforms MLIRLinalgDialect MLIRLinalgUtils MLIRSCFDialect MLIRSCFTransforms + MLIRSCFUtils MLIRPass + MLIRShardingInterface MLIRSubsetOpInterface MLIRSparseTensorDialect MLIRTensorDialect + MLIRTensorTilingInterfaceImpl + MLIRTensorTransforms MLIRTransforms MLIRTransformUtils MLIRValueBoundsOpInterface diff --git a/mlir/lib/Dialect/MemRef/Transforms/CMakeLists.txt b/mlir/lib/Dialect/MemRef/Transforms/CMakeLists.txt index ecab97bc2b8e7..4c7f42745ce8b 100644 --- a/mlir/lib/Dialect/MemRef/Transforms/CMakeLists.txt +++ 
b/mlir/lib/Dialect/MemRef/Transforms/CMakeLists.txt @@ -22,10 +22,13 @@ add_mlir_dialect_library(MLIRMemRefTransforms MLIRMemRefPassIncGen LINK_LIBS PUBLIC + MLIRAffineDialect MLIRAffineTransforms MLIRAffineUtils MLIRArithDialect MLIRArithTransforms + MLIRBufferizationDialect + MLIRBufferizationTransforms MLIRDialectUtils MLIRFuncDialect MLIRGPUDialect diff --git a/mlir/lib/Dialect/Mesh/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Mesh/Transforms/CMakeLists.txt index d1d6261fb448d..212ea6d6948b2 100644 --- a/mlir/lib/Dialect/Mesh/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Mesh/Transforms/CMakeLists.txt @@ -23,6 +23,8 @@ add_mlir_dialect_library(MLIRMeshTransforms MLIRIR MLIRMeshDialect MLIRPass + MLIRShardingInterface MLIRSupport MLIRTensorDialect + MLIRTosaShardingInterfaceImpl ) diff --git a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt index fb877b5091492..8c73515c608f5 100644 --- a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt @@ -29,6 +29,8 @@ add_mlir_dialect_library(MLIRSCFTransforms MLIRAffineDialect MLIRAffineAnalysis MLIRArithDialect + MLIRBufferizationDialect + MLIRBufferizationTransforms MLIRDestinationStyleOpInterface MLIRDialectUtils MLIRIR @@ -38,7 +40,9 @@ add_mlir_dialect_library(MLIRSCFTransforms MLIRSCFUtils MLIRSideEffectInterfaces MLIRSupport + MLIRTensorDialect MLIRTensorTransforms + MLIRTilingInterface MLIRTransforms MLIRTransformUtils ) diff --git a/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt index a6152ecc23b53..ce32dea09bb0b 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt @@ -20,9 +20,12 @@ add_mlir_dialect_library(MLIRTensorTransforms LINK_LIBS PUBLIC MLIRAffineDialect + MLIRAffineTransforms MLIRAffineUtils MLIRArithDialect MLIRArithUtils + MLIRBufferizationDialect + MLIRBufferizationTransforms 
MLIRDialectUtils MLIRIR MLIRLinalgDialect @@ -32,6 +35,7 @@ add_mlir_dialect_library(MLIRTensorTransforms MLIRSubsetOpInterface MLIRTensorDialect MLIRTensorUtils + MLIRTilingInterface MLIRTransforms MLIRVectorDialect MLIRVectorUtils diff --git a/mlir/lib/Dialect/Tosa/CMakeLists.txt b/mlir/lib/Dialect/Tosa/CMakeLists.txt index 4b72309a70c06..1911405c63cd5 100644 --- a/mlir/lib/Dialect/Tosa/CMakeLists.txt +++ b/mlir/lib/Dialect/Tosa/CMakeLists.txt @@ -19,6 +19,7 @@ add_mlir_dialect_library(MLIRTosaDialect MLIRDialect MLIRCallInterfaces MLIRControlFlowInterfaces + MLIRQuantDialect MLIRQuantUtils MLIRSideEffectInterfaces MLIRTensorDialect @@ -34,8 +35,10 @@ add_mlir_dialect_library(MLIRTosaShardingInterfaceImpl LINK_LIBS PUBLIC MLIRIR + MLIRMeshDialect MLIRShardingInterface MLIRSupport + MLIRTosaDialect ) add_subdirectory(Transforms) diff --git a/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt index b7e8724c3c258..2639a67e1c8b3 100644 --- a/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt @@ -35,6 +35,8 @@ add_mlir_dialect_library(MLIRVectorTransforms MLIRAffineAnalysis MLIRAffineUtils MLIRArithDialect + MLIRBufferizationDialect + MLIRBufferizationTransforms MLIRDialectUtils MLIRGPUDialect MLIRIR diff --git a/mlir/lib/Target/LLVM/CMakeLists.txt b/mlir/lib/Target/LLVM/CMakeLists.txt index 7e597b0f9bf94..bc14c568e46be 100644 --- a/mlir/lib/Target/LLVM/CMakeLists.txt +++ b/mlir/lib/Target/LLVM/CMakeLists.txt @@ -125,6 +125,7 @@ add_mlir_dialect_library(MLIRROCDLTarget MLIRSupport MLIRGPUDialect MLIRTargetLLVM + MLIRROCDLToLLVMIRTranslation ) if(MLIR_ENABLE_ROCM_CONVERSIONS) diff --git a/mlir/lib/Target/LLVMIR/CMakeLists.txt b/mlir/lib/Target/LLVMIR/CMakeLists.txt index 4cc83edb0e961..93032c3ce1038 100644 --- a/mlir/lib/Target/LLVMIR/CMakeLists.txt +++ b/mlir/lib/Target/LLVMIR/CMakeLists.txt @@ -38,6 +38,7 @@ add_mlir_translation_library(MLIRTargetLLVMIRExport 
MLIRDLTIDialect MLIRLLVMDialect MLIRLLVMIRTransforms + MLIRTranslateLib MLIRTransformUtils ) @@ -78,6 +79,7 @@ add_mlir_translation_library(MLIRTargetLLVMIRImport LINK_LIBS PUBLIC MLIRDLTIDialect MLIRLLVMDialect + MLIRTranslateLib ) add_mlir_translation_library(MLIRFromLLVMIRTranslationRegistration diff --git a/mlir/lib/Target/SPIRV/CMakeLists.txt b/mlir/lib/Target/SPIRV/CMakeLists.txt index 2926320ed286b..22d6d195a249e 100644 --- a/mlir/lib/Target/SPIRV/CMakeLists.txt +++ b/mlir/lib/Target/SPIRV/CMakeLists.txt @@ -12,6 +12,7 @@ add_mlir_translation_library(MLIRSPIRVBinaryUtils LINK_LIBS PUBLIC MLIRIR + MLIRSPIRVDialect MLIRSupport ) @@ -20,9 +21,11 @@ add_mlir_translation_library(MLIRSPIRVTranslateRegistration LINK_LIBS PUBLIC MLIRIR + MLIRSPIRVDialect MLIRSPIRVSerialization MLIRSPIRVDeserialization MLIRSupport + MLIRTranslateLib ) add_mlir_dialect_library(MLIRSPIRVTarget @@ -31,6 +34,7 @@ add_mlir_dialect_library(MLIRSPIRVTarget LINK_LIBS PUBLIC MLIRIR MLIRGPUDialect + MLIRSPIRVDialect MLIRSPIRVSerialization MLIRSupport ) diff --git a/mlir/lib/Target/SPIRV/Serialization/CMakeLists.txt b/mlir/lib/Target/SPIRV/Serialization/CMakeLists.txt index 036b97af4f9e7..71580d8fcb978 100644 --- a/mlir/lib/Target/SPIRV/Serialization/CMakeLists.txt +++ b/mlir/lib/Target/SPIRV/Serialization/CMakeLists.txt @@ -8,7 +8,10 @@ add_mlir_translation_library(MLIRSPIRVSerialization LINK_LIBS PUBLIC MLIRIR + MLIRSPIRVDialect MLIRSPIRVBinaryUtils MLIRSupport MLIRTranslateLib ) + + diff --git a/mlir/lib/Transforms/CMakeLists.txt b/mlir/lib/Transforms/CMakeLists.txt index 058039e47313e..90c0298fb5e46 100644 --- a/mlir/lib/Transforms/CMakeLists.txt +++ b/mlir/lib/Transforms/CMakeLists.txt @@ -29,6 +29,7 @@ add_mlir_library(MLIRTransforms LINK_LIBS PUBLIC MLIRAnalysis + MLIRCopyOpInterface MLIRFunctionInterfaces MLIRLoopLikeInterface MLIRMemorySlotInterfaces diff --git a/mlir/unittests/Target/LLVM/CMakeLists.txt b/mlir/unittests/Target/LLVM/CMakeLists.txt index 5d50e98f1a2a4..6d612548a94c0 
100644 --- a/mlir/unittests/Target/LLVM/CMakeLists.txt +++ b/mlir/unittests/Target/LLVM/CMakeLists.txt @@ -19,7 +19,6 @@ target_link_libraries(MLIRTargetLLVMTests MLIRNVVMToLLVMIRTranslation MLIRROCDLToLLVMIRTranslation MLIRGPUToLLVMIRTranslation - MLIRParser ${llvm_libs} ) From f3a4def436618c24e2eb9faa812994beb2cd7744 Mon Sep 17 00:00:00 2001 From: David Tenty Date: Mon, 30 Sep 2024 19:24:32 -0400 Subject: [PATCH 034/151] [libcxx][ios] initialize __fill_val_ in _FillHelper (#110279) This is a small fix to https://github.com/llvm/llvm-project/pull/89305. In the `__init` function of `_FillHelper`, `__fill_val_` was left uninitialized. This worked for the implementation in the PR because we always checked `__set_` before trying to read it, and would initialize if it was unset. However it turns out in earlier versions of the header (at least on AIX which followed this path), we do a read of `__fill_val_` even if `__set_` was false before initializing, to check if it matched the sentinel value, so this causes undesired behaviour and UB. --- libcxx/include/ios | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libcxx/include/ios b/libcxx/include/ios index 61a05fadd29a1..d4f15a269a11a 100644 --- a/libcxx/include/ios +++ b/libcxx/include/ios @@ -524,7 +524,10 @@ template // Attribute 'packed' is used to keep the layout compatible with the previous // definition of the '__fill_' and '_set_' pair in basic_ios on AIX & z/OS. 
struct _LIBCPP_PACKED _FillHelper { - _LIBCPP_HIDE_FROM_ABI void __init() { __set_ = false; } + _LIBCPP_HIDE_FROM_ABI void __init() { + __set_ = false; + __fill_val_ = _Traits::eof(); + } _LIBCPP_HIDE_FROM_ABI _FillHelper& operator=(typename _Traits::int_type __x) { __set_ = true; __fill_val_ = __x; From 915df1ae41652e2f595ce741dcd8f01878ef4e30 Mon Sep 17 00:00:00 2001 From: Younan Zhang Date: Tue, 1 Oct 2024 07:27:15 +0800 Subject: [PATCH 035/151] [Clang] Implement CWG 2707 "Deduction guides cannot have a trailing requires-clause" (#110473) Closes https://github.com/llvm/llvm-project/issues/98595 --- clang/docs/ReleaseNotes.rst | 3 ++ clang/include/clang/AST/DeclCXX.h | 9 ++++-- .../clang/Basic/DiagnosticSemaKinds.td | 2 -- clang/lib/AST/DeclCXX.cpp | 9 +++--- clang/lib/Sema/SemaDecl.cpp | 11 +++----- .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 2 +- clang/test/CXX/dcl/dcl.decl/p3.cpp | 2 +- clang/test/CXX/drs/cwg27xx.cpp | 28 +++++++++++++++++++ clang/www/cxx_dr_status.html | 4 +-- 9 files changed, 50 insertions(+), 20 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 02dfbfaaea207..1681ae8049a73 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -205,6 +205,9 @@ Resolutions to C++ Defect Reports - Reject explicit object parameters with type ``void`` (``this void``). (`CWG2915: Explicit object parameters of type void `_). +- Clang now allows trailing requires clause on explicit deduction guides. + (`CWG2707: Deduction guides cannot have a trailing requires-clause `_). 
+ C Language Changes ------------------ diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h index 252e6e9256414..2693cc0e95b4b 100644 --- a/clang/include/clang/AST/DeclCXX.h +++ b/clang/include/clang/AST/DeclCXX.h @@ -1965,9 +1965,11 @@ class CXXDeductionGuideDecl : public FunctionDecl { ExplicitSpecifier ES, const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, SourceLocation EndLocation, - CXXConstructorDecl *Ctor, DeductionCandidate Kind) + CXXConstructorDecl *Ctor, DeductionCandidate Kind, + Expr *TrailingRequiresClause) : FunctionDecl(CXXDeductionGuide, C, DC, StartLoc, NameInfo, T, TInfo, - SC_None, false, false, ConstexprSpecKind::Unspecified), + SC_None, false, false, ConstexprSpecKind::Unspecified, + TrailingRequiresClause), Ctor(Ctor), ExplicitSpec(ES) { if (EndLocation.isValid()) setRangeEnd(EndLocation); @@ -1987,7 +1989,8 @@ class CXXDeductionGuideDecl : public FunctionDecl { ExplicitSpecifier ES, const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, SourceLocation EndLocation, CXXConstructorDecl *Ctor = nullptr, - DeductionCandidate Kind = DeductionCandidate::Normal); + DeductionCandidate Kind = DeductionCandidate::Normal, + Expr *TrailingRequiresClause = nullptr); static CXXDeductionGuideDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID); diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 9e8f152852fd1..0f591022e6854 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -3050,8 +3050,6 @@ def note_is_deducible_constraint_evaluated_to_false : Note< "cannot deduce template arguments for %0 from %1">; def err_constrained_virtual_method : Error< "virtual function cannot have a requires clause">; -def err_trailing_requires_clause_on_deduction_guide : Error< - "deduction guide cannot have a requires clause">; def err_constrained_non_templated_function : 
Error<"non-templated function cannot have a requires clause">; def err_non_temp_spec_requires_clause : Error< diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp index 01143391edab4..f5a0aa8f82512 100644 --- a/clang/lib/AST/DeclCXX.cpp +++ b/clang/lib/AST/DeclCXX.cpp @@ -2211,9 +2211,10 @@ CXXDeductionGuideDecl *CXXDeductionGuideDecl::Create( ASTContext &C, DeclContext *DC, SourceLocation StartLoc, ExplicitSpecifier ES, const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, SourceLocation EndLocation, CXXConstructorDecl *Ctor, - DeductionCandidate Kind) { - return new (C, DC) CXXDeductionGuideDecl(C, DC, StartLoc, ES, NameInfo, T, - TInfo, EndLocation, Ctor, Kind); + DeductionCandidate Kind, Expr *TrailingRequiresClause) { + return new (C, DC) + CXXDeductionGuideDecl(C, DC, StartLoc, ES, NameInfo, T, TInfo, + EndLocation, Ctor, Kind, TrailingRequiresClause); } CXXDeductionGuideDecl * @@ -2221,7 +2222,7 @@ CXXDeductionGuideDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) { return new (C, ID) CXXDeductionGuideDecl( C, nullptr, SourceLocation(), ExplicitSpecifier(), DeclarationNameInfo(), QualType(), nullptr, SourceLocation(), nullptr, - DeductionCandidate::Normal); + DeductionCandidate::Normal, nullptr); } RequiresExprBodyDecl *RequiresExprBodyDecl::Create( diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 1bf0e800a3622..0e536f71a2f70 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -9293,15 +9293,12 @@ static FunctionDecl *CreateNewFunctionDecl(Sema &SemaRef, Declarator &D, TrailingRequiresClause); } else if (Name.getNameKind() == DeclarationName::CXXDeductionGuideName) { - if (TrailingRequiresClause) - SemaRef.Diag(TrailingRequiresClause->getBeginLoc(), - diag::err_trailing_requires_clause_on_deduction_guide) - << TrailingRequiresClause->getSourceRange(); if (SemaRef.CheckDeductionGuideDeclarator(D, R, SC)) return nullptr; - return 
CXXDeductionGuideDecl::Create(SemaRef.Context, DC, D.getBeginLoc(), - ExplicitSpecifier, NameInfo, R, TInfo, - D.getEndLoc()); + return CXXDeductionGuideDecl::Create( + SemaRef.Context, DC, D.getBeginLoc(), ExplicitSpecifier, NameInfo, R, + TInfo, D.getEndLoc(), /*Ctor=*/nullptr, + /*Kind=*/DeductionCandidate::Normal, TrailingRequiresClause); } else if (DC->isRecord()) { // If the name of the function is the same as the name of the record, // then this must be an invalid constructor that has a return type. diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index c3cb9d5d8c2c3..1c35c7d288e32 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -2233,7 +2233,7 @@ Decl *TemplateDeclInstantiator::VisitFunctionDecl( SemaRef.Context, DC, D->getInnerLocStart(), InstantiatedExplicitSpecifier, NameInfo, T, TInfo, D->getSourceRange().getEnd(), DGuide->getCorrespondingConstructor(), - DGuide->getDeductionCandidateKind()); + DGuide->getDeductionCandidateKind(), TrailingRequiresClause); Function->setAccess(D->getAccess()); } else { Function = FunctionDecl::Create( diff --git a/clang/test/CXX/dcl/dcl.decl/p3.cpp b/clang/test/CXX/dcl/dcl.decl/p3.cpp index f141568ba6c22..b082e1c122a09 100644 --- a/clang/test/CXX/dcl/dcl.decl/p3.cpp +++ b/clang/test/CXX/dcl/dcl.decl/p3.cpp @@ -65,4 +65,4 @@ struct R { }; template -R(T) -> R requires true; // expected-error{{deduction guide cannot have a requires clause}} +R(T) -> R requires true; diff --git a/clang/test/CXX/drs/cwg27xx.cpp b/clang/test/CXX/drs/cwg27xx.cpp index 2b57dbc60aed7..581e2af822d55 100644 --- a/clang/test/CXX/drs/cwg27xx.cpp +++ b/clang/test/CXX/drs/cwg27xx.cpp @@ -201,3 +201,31 @@ static_assert(false, f().s); #endif } // namespace cwg2798 +namespace cwg2707 { // cwg2707: 20 + +#if __cplusplus >= 202002L + +template struct A { + T value[N]; +}; + +template +A(T...) 
-> A requires (sizeof...(T) == 2); + +// Brace elision is not allowed for synthesized CTAD guides if the array size +// is value-dependent. +// So this should pick up our explicit deduction guide. +A a = {1, 2}; + +A b = {3, 4, 5}; +// expected-error@-1 {{no viable constructor or deduction guide}} \ +// expected-note@-13 {{candidate function template not viable}} \ +// expected-note@-13 {{implicit deduction guide}} \ +// expected-note@-8 {{constraints not satisfied}} \ +// expected-note@-8 {{because 'sizeof...(T) == 2' (3 == 2) evaluated to false}} \ +// expected-note@-13 {{candidate function template not viable}} \ +// expected-note@-13 {{implicit deduction guide}} + +#endif + +} // namespace cwg2707 diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index e5c5e50104fda..978351716ce33 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -16089,7 +16089,7 @@

C++ defect report implementation status

2707 DRWP Deduction guides cannot have a trailing requires-clause - Unknown + Clang 20 2708 @@ -17334,7 +17334,7 @@

C++ defect report implementation status

2913 tentatively ready Grammar for deduction-guide has requires-clause in the wrong position - Not resolved + Clang 20 2914 From a5cd5d351ddb164d7bb5e6c5e20b2b6519d793f1 Mon Sep 17 00:00:00 2001 From: YAMAMOTO Takashi Date: Tue, 1 Oct 2024 09:17:58 +0900 Subject: [PATCH 036/151] [lld][WebAssembly] Avoid emitting empty __wasm_apply_data_relocs function (#109249) Instead of always generating __wasm_apply_data_relocs when relevant options like -pie and -shared are specified, generate it only when the relevant relocations are actually necessary. Note: omitting empty __wasm_apply_data_relocs is not a problem because the export is optional in the spec (DynamicLinking.md) and all runtime linker implementations I'm aware of implement it that way. (emscripten, toywasm, wasm-tools) Motivations: * This possibly reduces the module size * This is also a preparation to fix https://github.com/llvm/llvm-project/issues/107387, for which it isn't obvious if we need these relocations at the time of createSyntheticSymbols. (unless we introduce a new explicit option like --non-pie-dynamic-link.) 
--- lld/test/wasm/data-segments.ll | 9 +-------- lld/test/wasm/shared-weak-symbols.s | 15 +++++---------- lld/test/wasm/tls-export.s | 3 --- lld/test/wasm/tls-non-shared-memory.s | 3 --- lld/test/wasm/tls-relocations.s | 2 +- lld/wasm/Driver.cpp | 11 ----------- lld/wasm/InputChunks.cpp | 10 ++++++++-- lld/wasm/InputChunks.h | 2 +- lld/wasm/Symbols.cpp | 1 - lld/wasm/Symbols.h | 8 ++------ lld/wasm/Writer.cpp | 22 ++++++++++++++++++---- 11 files changed, 36 insertions(+), 50 deletions(-) diff --git a/lld/test/wasm/data-segments.ll b/lld/test/wasm/data-segments.ll index 9354e6c8e4d2b..670ac3c1f373f 100644 --- a/lld/test/wasm/data-segments.ll +++ b/lld/test/wasm/data-segments.ll @@ -113,7 +113,7 @@ ; PASSIVE-NEXT: Name: __wasm_init_memory ; PASSIVE-PIC: - Type: START -; PASSIVE-PIC-NEXT: StartFunction: 3 +; PASSIVE-PIC-NEXT: StartFunction: 2 ; PASSIVE-PIC-NEXT: - Type: DATACOUNT ; PASSIVE-PIC-NEXT: Count: 3 ; PASSIVE-PIC-NEXT: - Type: CODE @@ -125,9 +125,6 @@ ; PASSIVE-PIC-NEXT: Locals: [] ; PASSIVE-PIC-NEXT: Body: {{.*}} ; PASSIVE-PIC-NEXT: - Index: 2 -; PASSIVE-PIC-NEXT: Locals: [] -; PASSIVE-PIC-NEXT: Body: 0B -; PASSIVE-PIC-NEXT: - Index: 3 ; PASSIVE-PIC-NEXT: Locals: ; PASSIVE32-PIC-NEXT: - Type: I32 ; PASSIVE64-PIC-NEXT: - Type: I64 @@ -152,8 +149,6 @@ ; PASSIVE-PIC-NEXT: - Index: 1 ; PASSIVE-PIC-NEXT: Name: __wasm_init_tls ; PASSIVE-PIC-NEXT: - Index: 2 -; PASSIVE-PIC-NEXT: Name: __wasm_apply_data_relocs -; PASSIVE-PIC-NEXT: - Index: 3 ; PASSIVE-PIC-NEXT: Name: __wasm_init_memory ; no data relocations. 
@@ -161,8 +156,6 @@ ; DIS-EMPTY: ; DIS-NEXT: end -; In PIC mode __wasm_apply_data_relocs is export separatly to __wasm_call_ctors -; PIC-DIS: <__wasm_apply_data_relocs>: ; PIC-DIS-EMPTY: ; DIS-LABEL: <__wasm_init_memory>: diff --git a/lld/test/wasm/shared-weak-symbols.s b/lld/test/wasm/shared-weak-symbols.s index 90de006353b3d..df049ce4600fe 100644 --- a/lld/test/wasm/shared-weak-symbols.s +++ b/lld/test/wasm/shared-weak-symbols.s @@ -30,7 +30,7 @@ call_weak: # ASM: 10 80 80 80 80 00 call 0 drop call hidden_weak_func -# ASM: 10 84 80 80 80 00 call 4 +# ASM: 10 83 80 80 80 00 call 3 end_function # ASM-NEXT: 0b end @@ -62,15 +62,12 @@ call_weak: # CHECK-NEXT: - Name: __wasm_call_ctors # CHECK-NEXT: Kind: FUNCTION # CHECK-NEXT: Index: 1 -# CHECK-NEXT: - Name: __wasm_apply_data_relocs -# CHECK-NEXT: Kind: FUNCTION -# CHECK-NEXT: Index: 2 # CHECK-NEXT: - Name: weak_func # CHECK-NEXT: Kind: FUNCTION -# CHECK-NEXT: Index: 3 +# CHECK-NEXT: Index: 2 # CHECK-NEXT: - Name: call_weak # CHECK-NEXT: Kind: FUNCTION -# CHECK-NEXT: Index: 5 +# CHECK-NEXT: Index: 4 # CHECK-NEXT: - Type: CODE # CHECK: - Type: CUSTOM @@ -81,10 +78,8 @@ call_weak: # CHECK-NEXT: - Index: 1 # CHECK-NEXT: Name: __wasm_call_ctors # CHECK-NEXT: - Index: 2 -# CHECK-NEXT: Name: __wasm_apply_data_relocs -# CHECK-NEXT: - Index: 3 # CHECK-NEXT: Name: weak_func -# CHECK-NEXT: - Index: 4 +# CHECK-NEXT: - Index: 3 # CHECK-NEXT: Name: hidden_weak_func -# CHECK-NEXT: - Index: 5 +# CHECK-NEXT: - Index: 4 # CHECK-NEXT: Name: call_weak diff --git a/lld/test/wasm/tls-export.s b/lld/test/wasm/tls-export.s index 1f64be607abb2..619f9d2df312a 100644 --- a/lld/test/wasm/tls-export.s +++ b/lld/test/wasm/tls-export.s @@ -40,9 +40,6 @@ tls1: # CHECK-NEXT: - Name: __wasm_call_ctors # CHECK-NEXT: Kind: FUNCTION # CHECK-NEXT: Index: 0 -# CHECK-NEXT: - Name: __wasm_apply_data_relocs -# CHECK-NEXT: Kind: FUNCTION -# CHECK-NEXT: Index: 1 # CHECK-NEXT: - Name: tls1 # CHECK-NEXT: Kind: GLOBAL # CHECK-NEXT: Index: 2 diff --git 
a/lld/test/wasm/tls-non-shared-memory.s b/lld/test/wasm/tls-non-shared-memory.s index a2e2257cc9392..1754fd6254bb8 100644 --- a/lld/test/wasm/tls-non-shared-memory.s +++ b/lld/test/wasm/tls-non-shared-memory.s @@ -127,9 +127,6 @@ tls1: # PIE-NEXT: - Name: memory # PIE-NEXT: Kind: MEMORY # PIE-NEXT: Index: 0 -# PIE-NEXT: - Name: __wasm_apply_data_relocs -# PIE-NEXT: Kind: FUNCTION -# PIE-NEXT: Index: 1 # PIE-NEXT: - Type: # .tdata and .data are combined into single segment in PIC mode. diff --git a/lld/test/wasm/tls-relocations.s b/lld/test/wasm/tls-relocations.s index ebe83227631f4..7260d72535a00 100644 --- a/lld/test/wasm/tls-relocations.s +++ b/lld/test/wasm/tls-relocations.s @@ -66,7 +66,7 @@ tls_sym: # ASM-NEXT: i32.const 16 # ASM-NEXT: memory.init 0, 0 # call to __wasm_apply_tls_relocs -# ASM-NEXT: call 4 +# ASM-NEXT: call 3 # ASM-NEXT: end # ASM: <__wasm_apply_tls_relocs>: diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index 2de7dcaeb43d4..289c1217ff5ea 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -917,17 +917,6 @@ static void createSyntheticSymbols() { is64 ? i64ArgSignature : i32ArgSignature, "__wasm_init_tls")); } - - if (ctx.isPic || - config->unresolvedSymbols == UnresolvedPolicy::ImportDynamic) { - // For PIC code, or when dynamically importing addresses, we create - // synthetic functions that apply relocations. These get called from - // __wasm_call_ctors before the user-level constructors. 
- WasmSym::applyDataRelocs = symtab->addSyntheticFunction( - "__wasm_apply_data_relocs", - WASM_SYMBOL_VISIBILITY_DEFAULT | WASM_SYMBOL_EXPORTED, - make(nullSignature, "__wasm_apply_data_relocs")); - } } static void createOptionalSymbols() { diff --git a/lld/wasm/InputChunks.cpp b/lld/wasm/InputChunks.cpp index 975225974aff6..9383dcaeb4f55 100644 --- a/lld/wasm/InputChunks.cpp +++ b/lld/wasm/InputChunks.cpp @@ -361,11 +361,12 @@ uint64_t InputChunk::getVA(uint64_t offset) const { // Generate code to apply relocations to the data section at runtime. // This is only called when generating shared libraries (PIC) where address are // not known at static link time. -void InputChunk::generateRelocationCode(raw_ostream &os) const { +bool InputChunk::generateRelocationCode(raw_ostream &os) const { LLVM_DEBUG(dbgs() << "generating runtime relocations: " << name << " count=" << relocations.size() << "\n"); bool is64 = config->is64.value_or(false); + bool generated = false; unsigned opcode_ptr_const = is64 ? WASM_OPCODE_I64_CONST : WASM_OPCODE_I32_CONST; unsigned opcode_ptr_add = is64 ? WASM_OPCODE_I64_ADD @@ -378,7 +379,10 @@ void InputChunk::generateRelocationCode(raw_ostream &os) const { uint64_t offset = getVA(rel.Offset) - getInputSectionOffset(); Symbol *sym = file->getSymbol(rel); - if (!ctx.isPic && sym->isDefined()) + // Runtime relocations are needed when we don't know the address of + // a symbol statically. + bool requiresRuntimeReloc = ctx.isPic || sym->hasGOTIndex(); + if (!requiresRuntimeReloc) continue; LLVM_DEBUG(dbgs() << "gen reloc: type=" << relocTypeToString(rel.Type) @@ -435,7 +439,9 @@ void InputChunk::generateRelocationCode(raw_ostream &os) const { writeU8(os, opcode_reloc_store, "I32_STORE"); writeUleb128(os, 2, "align"); writeUleb128(os, 0, "offset"); + generated = true; } + return generated; } // Split WASM_SEG_FLAG_STRINGS section. 
Such a section is a sequence of diff --git a/lld/wasm/InputChunks.h b/lld/wasm/InputChunks.h index 5174439facc67..14eb008c212fb 100644 --- a/lld/wasm/InputChunks.h +++ b/lld/wasm/InputChunks.h @@ -78,7 +78,7 @@ class InputChunk { size_t getNumRelocations() const { return relocations.size(); } void writeRelocations(llvm::raw_ostream &os) const; - void generateRelocationCode(raw_ostream &os) const; + bool generateRelocationCode(raw_ostream &os) const; bool isTLS() const { return flags & llvm::wasm::WASM_SEG_FLAG_TLS; } bool isRetained() const { return flags & llvm::wasm::WASM_SEG_FLAG_RETAIN; } diff --git a/lld/wasm/Symbols.cpp b/lld/wasm/Symbols.cpp index f74699d0763fd..b2bbd11c53ef2 100644 --- a/lld/wasm/Symbols.cpp +++ b/lld/wasm/Symbols.cpp @@ -80,7 +80,6 @@ namespace wasm { DefinedFunction *WasmSym::callCtors; DefinedFunction *WasmSym::callDtors; DefinedFunction *WasmSym::initMemory; -DefinedFunction *WasmSym::applyDataRelocs; DefinedFunction *WasmSym::applyGlobalRelocs; DefinedFunction *WasmSym::applyTLSRelocs; DefinedFunction *WasmSym::applyGlobalTLSRelocs; diff --git a/lld/wasm/Symbols.h b/lld/wasm/Symbols.h index 2ba575fddc879..5ce3ecbc4ab19 100644 --- a/lld/wasm/Symbols.h +++ b/lld/wasm/Symbols.h @@ -591,18 +591,14 @@ struct WasmSym { // Function that calls the libc/etc. cleanup function. static DefinedFunction *callDtors; - // __wasm_apply_data_relocs - // Function that applies relocations to data segment post-instantiation. - static DefinedFunction *applyDataRelocs; - // __wasm_apply_global_relocs // Function that applies relocations to wasm globals post-instantiation. // Unlike __wasm_apply_data_relocs this needs to run on every thread. static DefinedFunction *applyGlobalRelocs; // __wasm_apply_tls_relocs - // Like applyDataRelocs but for TLS section. These must be delayed until - // __wasm_init_tls. + // Like __wasm_apply_data_relocs but for TLS section. These must be + // delayed until __wasm_init_tls. 
static DefinedFunction *applyTLSRelocs; // __wasm_apply_global_tls_relocs diff --git a/lld/wasm/Writer.cpp b/lld/wasm/Writer.cpp index 681f6a137ceac..77cddfc34389c 100644 --- a/lld/wasm/Writer.cpp +++ b/lld/wasm/Writer.cpp @@ -1145,6 +1145,8 @@ void Writer::createSyntheticInitFunctions() { static WasmSignature nullSignature = {{}, {}}; + createApplyDataRelocationsFunction(); + // Passive segments are used to avoid memory being reinitialized on each // thread's instantiation. These passive segments are initialized and // dropped in __wasm_init_memory, which is registered as the start function @@ -1467,15 +1469,29 @@ void Writer::createApplyDataRelocationsFunction() { { raw_string_ostream os(bodyContent); writeUleb128(os, 0, "num locals"); + bool generated = false; for (const OutputSegment *seg : segments) if (!config->sharedMemory || !seg->isTLS()) for (const InputChunk *inSeg : seg->inputSegments) - inSeg->generateRelocationCode(os); + generated |= inSeg->generateRelocationCode(os); + if (!generated) { + LLVM_DEBUG(dbgs() << "skipping empty __wasm_apply_data_relocs\n"); + return; + } writeU8(os, WASM_OPCODE_END, "END"); } - createFunction(WasmSym::applyDataRelocs, bodyContent); + // __wasm_apply_data_relocs + // Function that applies relocations to data segment post-instantiation. 
+ static WasmSignature nullSignature = {{}, {}}; + auto def = symtab->addSyntheticFunction( + "__wasm_apply_data_relocs", + WASM_SYMBOL_VISIBILITY_DEFAULT | WASM_SYMBOL_EXPORTED, + make(nullSignature, "__wasm_apply_data_relocs")); + def->markLive(); + + createFunction(def, bodyContent); } void Writer::createApplyTLSRelocationsFunction() { @@ -1771,8 +1787,6 @@ void Writer::run() { if (!config->relocatable) { // Create linker synthesized functions - if (WasmSym::applyDataRelocs) - createApplyDataRelocationsFunction(); if (WasmSym::applyGlobalRelocs) createApplyGlobalRelocationsFunction(); if (WasmSym::applyTLSRelocs) From 50e5411e4247421fd606f0a206682fcdf0303ae3 Mon Sep 17 00:00:00 2001 From: Younan Zhang Date: Tue, 1 Oct 2024 08:19:35 +0800 Subject: [PATCH 037/151] [Clang][Sema] Retain the expanding index for unevaluated type constraints (#109518) (This continues the effort of #86265, fixing another piece of issue in constraint evaluation on variadic lambdas.) We need the depth of the primary template parameters for constraint substitution. To that end, we avoided substituting type constraints by copying the constraint expression when instantiating a template. This, however, has left an issue in that for lambda's parameters, they can reference outer template packs that would be expanded in the process of an instantiation, where these parameters would make their way into the constraint evaluation, wherein we have no other way to expand them later in evaluation. For example, template void foo() { bar([](C auto value) {}...); } The lambda references a pack `Ts` that should be expanded when instantiating `foo()`. The `Ts` along with the constraint expression would not be transformed until constraint evaluation, and at that point, we would have no chance to expand `Ts` anyhow. 
This patch takes an approach that transforms `Ts` from an unexpanded TemplateTypeParmType into a SubstTemplateTypeParmType with the current pack substitution index, such that we could use that to expand the type during evaluation. Fixes #101754 --- clang/docs/ReleaseNotes.rst | 2 + clang/include/clang/Sema/Sema.h | 1 + clang/lib/Sema/SemaTemplate.cpp | 6 +- clang/lib/Sema/SemaTemplateInstantiate.cpp | 127 +++++++++++++++++- clang/lib/Sema/SemaType.cpp | 8 +- .../SemaCXX/fold_lambda_with_variadics.cpp | 54 ++++++++ 6 files changed, 191 insertions(+), 7 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 1681ae8049a73..a7c1bb80a49db 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -451,6 +451,8 @@ Bug Fixes to C++ Support diagnosing a failed cast caused indirectly by a failed implicit conversion to the type of the constructor parameter. - Fixed an assertion failure by adjusting integral to boolean vector conversions (#GH108326) - Mangle friend function templates with a constraint that depends on a template parameter from an enclosing template as members of the enclosing class. (#GH110247) +- Fixed an issue in constraint evaluation, where type constraints on the lambda expression + containing outer unexpanded parameters were not correctly expanded. 
(#GH101754) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index a9ce3681338d4..d616c3834c429 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -11253,6 +11253,7 @@ class Sema final : public SemaBase { ConceptDecl *NamedConcept, NamedDecl *FoundDecl, const TemplateArgumentListInfo *TemplateArgs, TemplateTypeParmDecl *ConstrainedParameter, + QualType ConstrainedType, SourceLocation EllipsisLoc); bool AttachTypeConstraint(AutoTypeLoc TL, diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 99423b01114cc..c7d48b81bc034 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -1134,7 +1134,8 @@ bool Sema::BuildTypeConstraint(const CXXScopeSpec &SS, SS.isSet() ? SS.getWithLocInContext(Context) : NestedNameSpecifierLoc(), ConceptName, CD, /*FoundDecl=*/USD ? cast(USD) : CD, TypeConstr->LAngleLoc.isValid() ? &TemplateArgs : nullptr, - ConstrainedParameter, EllipsisLoc); + ConstrainedParameter, Context.getTypeDeclType(ConstrainedParameter), + EllipsisLoc); } template @@ -1191,6 +1192,7 @@ bool Sema::AttachTypeConstraint(NestedNameSpecifierLoc NS, ConceptDecl *NamedConcept, NamedDecl *FoundDecl, const TemplateArgumentListInfo *TemplateArgs, TemplateTypeParmDecl *ConstrainedParameter, + QualType ConstrainedType, SourceLocation EllipsisLoc) { // C++2a [temp.param]p4: // [...] If Q is of the form C, then let E' be @@ -1199,7 +1201,7 @@ bool Sema::AttachTypeConstraint(NestedNameSpecifierLoc NS, TemplateArgs ? 
ASTTemplateArgumentListInfo::Create(Context, *TemplateArgs) : nullptr; - QualType ParamAsArgument(ConstrainedParameter->getTypeForDecl(), 0); + QualType ParamAsArgument = ConstrainedType; ExprResult ImmediatelyDeclaredConstraint = formImmediatelyDeclaredConstraint( *this, NS, NameInfo, NamedConcept, FoundDecl, diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index fd51fa4afcacb..e874ab563e2f8 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -1655,6 +1655,21 @@ namespace { SubstTemplateTypeParmPackTypeLoc TL, bool SuppressObjCLifetime); + QualType + TransformSubstTemplateTypeParmType(TypeLocBuilder &TLB, + SubstTemplateTypeParmTypeLoc TL) { + if (SemaRef.CodeSynthesisContexts.back().Kind != + Sema::CodeSynthesisContext::ConstraintSubstitution) + return inherited::TransformSubstTemplateTypeParmType(TLB, TL); + + auto PackIndex = TL.getTypePtr()->getPackIndex(); + std::optional SubstIndex; + if (SemaRef.ArgumentPackSubstitutionIndex == -1 && PackIndex) + SubstIndex.emplace(SemaRef, *PackIndex); + + return inherited::TransformSubstTemplateTypeParmType(TLB, TL); + } + CXXRecordDecl::LambdaDependencyKind ComputeLambdaDependency(LambdaScopeInfo *LSI) { if (auto TypeAlias = @@ -3078,6 +3093,58 @@ namespace { } // namespace +namespace { + +struct ExpandPackedTypeConstraints + : TreeTransform { + + using inherited = TreeTransform; + + ExpandPackedTypeConstraints(Sema &SemaRef) : inherited(SemaRef) {} + + using inherited::TransformTemplateTypeParmType; + + QualType TransformTemplateTypeParmType(TypeLocBuilder &TLB, + TemplateTypeParmTypeLoc TL, bool) { + const TemplateTypeParmType *T = TL.getTypePtr(); + if (!T->isParameterPack()) { + TemplateTypeParmTypeLoc NewTL = + TLB.push(TL.getType()); + NewTL.setNameLoc(TL.getNameLoc()); + return TL.getType(); + } + + assert(SemaRef.ArgumentPackSubstitutionIndex != -1); + + QualType Result = 
SemaRef.Context.getSubstTemplateTypeParmType( + TL.getType(), T->getDecl(), T->getIndex(), + SemaRef.ArgumentPackSubstitutionIndex); + SubstTemplateTypeParmTypeLoc NewTL = + TLB.push(Result); + NewTL.setNameLoc(TL.getNameLoc()); + return Result; + } + + QualType TransformSubstTemplateTypeParmType(TypeLocBuilder &TLB, + SubstTemplateTypeParmTypeLoc TL) { + const SubstTemplateTypeParmType *T = TL.getTypePtr(); + if (T->getPackIndex()) { + SubstTemplateTypeParmTypeLoc TypeLoc = + TLB.push(TL.getType()); + TypeLoc.setNameLoc(TL.getNameLoc()); + return TypeLoc.getType(); + } + return inherited::TransformSubstTemplateTypeParmType(TLB, TL); + } + + bool SubstTemplateArguments(ArrayRef Args, + TemplateArgumentListInfo &Out) { + return inherited::TransformTemplateArguments(Args.begin(), Args.end(), Out); + } +}; + +} // namespace + bool Sema::SubstTypeConstraint( TemplateTypeParmDecl *Inst, const TypeConstraint *TC, const MultiLevelTemplateArgumentList &TemplateArgs, @@ -3086,9 +3153,62 @@ bool Sema::SubstTypeConstraint( TC->getTemplateArgsAsWritten(); if (!EvaluateConstraints) { - Inst->setTypeConstraint(TC->getConceptReference(), - TC->getImmediatelyDeclaredConstraint()); - return false; + bool ShouldExpandExplicitTemplateArgs = + TemplArgInfo && ArgumentPackSubstitutionIndex != -1 && + llvm::any_of(TemplArgInfo->arguments(), [](auto &Arg) { + return Arg.getArgument().containsUnexpandedParameterPack(); + }); + + // We want to transform the packs into Subst* nodes for type constraints + // inside a pack expansion. For example, + // + // template void foo() { + // bar([](C auto value) {}...); + // } + // + // As we expand Ts in the process of instantiating foo(), and retain + // the original template depths of Ts until the constraint evaluation, we + // would otherwise have no chance to expand Ts by the time of evaluating + // C. 
+ // + // So we form a Subst* node for Ts along with a proper substitution index + // here, and substitute the node with a complete MLTAL later in evaluation. + if (ShouldExpandExplicitTemplateArgs) { + TemplateArgumentListInfo InstArgs; + InstArgs.setLAngleLoc(TemplArgInfo->LAngleLoc); + InstArgs.setRAngleLoc(TemplArgInfo->RAngleLoc); + if (ExpandPackedTypeConstraints(*this).SubstTemplateArguments( + TemplArgInfo->arguments(), InstArgs)) + return true; + + // The type of the original parameter. + auto *ConstraintExpr = TC->getImmediatelyDeclaredConstraint(); + QualType ConstrainedType; + + if (auto *FE = dyn_cast(ConstraintExpr)) { + assert(FE->getLHS()); + ConstraintExpr = FE->getLHS(); + } + auto *CSE = cast(ConstraintExpr); + assert(!CSE->getTemplateArguments().empty() && + "Empty template arguments?"); + ConstrainedType = CSE->getTemplateArguments()[0].getAsType(); + assert(!ConstrainedType.isNull() && + "Failed to extract the original ConstrainedType?"); + + return AttachTypeConstraint( + TC->getNestedNameSpecifierLoc(), TC->getConceptNameInfo(), + TC->getNamedConcept(), + /*FoundDecl=*/TC->getConceptReference()->getFoundDecl(), &InstArgs, + Inst, ConstrainedType, + Inst->isParameterPack() + ? cast(TC->getImmediatelyDeclaredConstraint()) + ->getEllipsisLoc() + : SourceLocation()); + } + Inst->setTypeConstraint(TC->getConceptReference(), + TC->getImmediatelyDeclaredConstraint()); + return false; } TemplateArgumentListInfo InstArgs; @@ -3104,6 +3224,7 @@ bool Sema::SubstTypeConstraint( TC->getNestedNameSpecifierLoc(), TC->getConceptNameInfo(), TC->getNamedConcept(), /*FoundDecl=*/TC->getConceptReference()->getFoundDecl(), &InstArgs, Inst, + Context.getTypeDeclType(Inst), Inst->isParameterPack() ? 
cast(TC->getImmediatelyDeclaredConstraint()) ->getEllipsisLoc() diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index a7beb9d222c3b..c44fc9c4194ca 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -3035,7 +3035,9 @@ InventTemplateParameter(TypeProcessingState &state, QualType T, AutoLoc.getNestedNameSpecifierLoc(), AutoLoc.getConceptNameInfo(), AutoLoc.getNamedConcept(), /*FoundDecl=*/AutoLoc.getFoundDecl(), AutoLoc.hasExplicitTemplateArgs() ? &TAL : nullptr, - InventedTemplateParam, D.getEllipsisLoc()); + InventedTemplateParam, + S.Context.getTypeDeclType(InventedTemplateParam), + D.getEllipsisLoc()); } } else { // The 'auto' appears in the decl-specifiers; we've not finished forming @@ -3072,7 +3074,9 @@ InventTemplateParameter(TypeProcessingState &state, QualType T, /*FoundDecl=*/ USD ? cast(USD) : CD, TemplateId->LAngleLoc.isValid() ? &TemplateArgsInfo : nullptr, - InventedTemplateParam, D.getEllipsisLoc()); + InventedTemplateParam, + S.Context.getTypeDeclType(InventedTemplateParam), + D.getEllipsisLoc()); } } } diff --git a/clang/test/SemaCXX/fold_lambda_with_variadics.cpp b/clang/test/SemaCXX/fold_lambda_with_variadics.cpp index 14e242f009dc5..2257a4c2d975a 100644 --- a/clang/test/SemaCXX/fold_lambda_with_variadics.cpp +++ b/clang/test/SemaCXX/fold_lambda_with_variadics.cpp @@ -179,3 +179,57 @@ void foo() { } } // namespace GH99877 + +namespace GH101754 { + +template struct Overloaded : Ts... { + using Ts::operator()...; +}; + +template Overloaded(Ts...) 
-> Overloaded; + +template +concept same_as = __is_same(T, U); // #same_as + +template constexpr auto foo() { + return Overloaded{[](same_as auto value) { return value; }...}; // #lambda +} + +static_assert(foo()(123) == 123); +static_assert(foo()(2.718) == 2.718); + +static_assert(foo()('c')); +// expected-error@-1 {{no matching function}} + +// expected-note@#lambda {{constraints not satisfied}} +// expected-note@#lambda {{'same_as' evaluated to false}} +// expected-note@#same_as {{evaluated to false}} + +// expected-note@#lambda {{constraints not satisfied}} +// expected-note@#lambda {{'same_as' evaluated to false}} +// expected-note@#same_as {{evaluated to false}} + +template +concept C = same_as && same_as; // #C + +template constexpr auto bar() { + return ([]() { + return Overloaded{[](C auto value) { // #bar + return value; + }...}; + }.template operator()(), ...); +} +static_assert(bar()(3.14f)); // OK, bar() returns the last overload i.e. . + +static_assert(bar()(123)); +// expected-error@-1 {{no matching function}} +// expected-note@#bar {{constraints not satisfied}} +// expected-note@#bar {{'C' evaluated to false}} +// expected-note@#C {{evaluated to false}} + +// expected-note@#bar {{constraints not satisfied}} +// expected-note@#bar {{'C' evaluated to false}} +// expected-note@#C {{evaluated to false}} +// expected-note@#same_as 2{{evaluated to false}} + +} // namespace GH101754 From 78ff3401482384203b8ea664eee20fb81f8fb933 Mon Sep 17 00:00:00 2001 From: Jacob Lalonde Date: Mon, 30 Sep 2024 17:56:32 -0700 Subject: [PATCH 038/151] [LLDB][Minidump] Fix bug where we were using the wrong collection for thread stacks (#110579) In my prior two save core API's, I experimented on how to save stacks with the new API. I incorrectly left these in, as the existing `m_thread_by_range_end` was the correct choice. I have removed the no-op collection, and moved to use the proper one. 
It's worth noting this was not caught by testing because we do not verify where the items are contained in the minidump. This would require a test being aware of how minidumps are structured, or adding a textual tool that we can then scan the output of. --- lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp | 2 +- lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp index 3f1e25f730a18..f6c16b6e3d96a 100644 --- a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp +++ b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp @@ -853,7 +853,7 @@ Status MinidumpFileBuilder::AddMemoryList() { uint64_t total_size = GetCurrentDataEndOffset(); auto iterator = all_core_memory_vec.begin(); while (iterator != all_core_memory_vec.end()) { - if (m_saved_stack_ranges.count(iterator->range.start()) > 0) { + if (m_thread_by_range_end.count(iterator->range.end()) > 0) { // We don't save stacks twice. ranges_32.push_back(*iterator); total_size += diff --git a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h index d5eac9015ac42..a4240f871c8a2 100644 --- a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h +++ b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h @@ -172,7 +172,6 @@ class MinidumpFileBuilder { // to duplicate it in the exception data. 
std::unordered_map m_tid_to_reg_ctx; - std::unordered_set m_saved_stack_ranges; lldb::FileUP m_core_file; lldb_private::SaveCoreOptions m_save_core_options; }; From 45e1a38a10371b3ccf2a2199a0c16c6112751a3d Mon Sep 17 00:00:00 2001 From: Wael Yehia Date: Tue, 1 Oct 2024 01:11:19 +0000 Subject: [PATCH 039/151] [PGO] use -fprofile-update=atomic instead of mllvm option in ContinuousSyncMode/set-file-object.c because on some platforms (e.g. AIX) the compiler adds -latomic to the link step in 32-bit. --- compiler-rt/test/profile/ContinuousSyncMode/set-file-object.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/test/profile/ContinuousSyncMode/set-file-object.c b/compiler-rt/test/profile/ContinuousSyncMode/set-file-object.c index c7eb27057a923..321a69b4f23d0 100644 --- a/compiler-rt/test/profile/ContinuousSyncMode/set-file-object.c +++ b/compiler-rt/test/profile/ContinuousSyncMode/set-file-object.c @@ -5,7 +5,7 @@ // RUN: rm -rf %t.dir && mkdir -p %t.dir && cd %t.dir // The -mllvm -runtime-counter-relocation=true flag has effect only on linux. -// RUN: %clang -fprofile-instr-generate -fcoverage-mapping -mllvm -instrprof-atomic-counter-update-all=1 -mllvm -runtime-counter-relocation=true -o main.exe %s +// RUN: %clang -fprofile-instr-generate -fcoverage-mapping -fprofile-update=atomic -mllvm -runtime-counter-relocation=true -o main.exe %s // Test continuous mode with __llvm_profile_set_file_object with mergin disabled. // RUN: env LLVM_PROFILE_FILE="%t.dir/profdir/%c%mprofraw.old" %run %t.dir/main.exe nomerge %t.dir/profdir/profraw.new 2>&1 | FileCheck %s -check-prefix=WARN From 4852374135773b03c14ba2003be99ed1169dedf4 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Tue, 1 Oct 2024 02:05:28 +0100 Subject: [PATCH 040/151] [llvm][opt][Transforms] Replacement `calloc` should match replaced `malloc` (#110524) Currently DSE unconditionally emits `calloc` as returning a pointer to AS0. 
However, this is incorrect for targets that have a non-zero default AS, as it'd not match the `malloc` signature. This patch addresses that by piping through the AS for the pointer returned by `malloc` into the `calloc` insertion call. --- .../llvm/Transforms/Utils/BuildLibCalls.h | 2 +- .../Transforms/Scalar/DeadStoreElimination.cpp | 5 +++-- llvm/lib/Transforms/Utils/BuildLibCalls.cpp | 6 +++--- .../malloc-to-calloc-with-nonzero-default-as.ll | 17 +++++++++++++++++ 4 files changed, 24 insertions(+), 6 deletions(-) create mode 100644 llvm/test/Transforms/DeadStoreElimination/malloc-to-calloc-with-nonzero-default-as.ll diff --git a/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h b/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h index 1979c4af770b0..a8fb38e726004 100644 --- a/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h +++ b/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h @@ -251,7 +251,7 @@ namespace llvm { /// Emit a call to the calloc function. Value *emitCalloc(Value *Num, Value *Size, IRBuilderBase &B, - const TargetLibraryInfo &TLI); + const TargetLibraryInfo &TLI, unsigned AddrSpace); /// Emit a call to the hot/cold operator new function. 
Value *emitHotColdNew(Value *Num, IRBuilderBase &B, diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index a304f7b056f5f..ce8c988ba531d 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -1944,8 +1944,9 @@ struct DSEState { return false; IRBuilder<> IRB(Malloc); Type *SizeTTy = Malloc->getArgOperand(0)->getType(); - auto *Calloc = emitCalloc(ConstantInt::get(SizeTTy, 1), - Malloc->getArgOperand(0), IRB, TLI); + auto *Calloc = + emitCalloc(ConstantInt::get(SizeTTy, 1), Malloc->getArgOperand(0), IRB, + TLI, Malloc->getType()->getPointerAddressSpace()); if (!Calloc) return false; diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index d4727dece19f6..7bb4b55fcb7cf 100644 --- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -1978,15 +1978,15 @@ Value *llvm::emitMalloc(Value *Num, IRBuilderBase &B, const DataLayout &DL, } Value *llvm::emitCalloc(Value *Num, Value *Size, IRBuilderBase &B, - const TargetLibraryInfo &TLI) { + const TargetLibraryInfo &TLI, unsigned AddrSpace) { Module *M = B.GetInsertBlock()->getModule(); if (!isLibFuncEmittable(M, &TLI, LibFunc_calloc)) return nullptr; StringRef CallocName = TLI.getName(LibFunc_calloc); Type *SizeTTy = getSizeTTy(B, &TLI); - FunctionCallee Calloc = getOrInsertLibFunc(M, TLI, LibFunc_calloc, - B.getPtrTy(), SizeTTy, SizeTTy); + FunctionCallee Calloc = getOrInsertLibFunc( + M, TLI, LibFunc_calloc, B.getPtrTy(AddrSpace), SizeTTy, SizeTTy); inferNonMandatoryLibFuncAttrs(M, CallocName, TLI); CallInst *CI = B.CreateCall(Calloc, {Num, Size}, CallocName); diff --git a/llvm/test/Transforms/DeadStoreElimination/malloc-to-calloc-with-nonzero-default-as.ll b/llvm/test/Transforms/DeadStoreElimination/malloc-to-calloc-with-nonzero-default-as.ll new file mode 100644 index 
0000000000000..977bf93fa856e --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/malloc-to-calloc-with-nonzero-default-as.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=dse < %s | FileCheck %s + +define ptr addrspace(4) @malloc_to_calloc(i64 %size) { +; CHECK-LABEL: define ptr addrspace(4) @malloc_to_calloc( +; CHECK-SAME: i64 [[SIZE:%.*]]) { +; CHECK-NEXT: [[CALLOC:%.*]] = call ptr addrspace(4) @calloc(i64 1, i64 [[SIZE]]) +; CHECK-NEXT: ret ptr addrspace(4) [[CALLOC]] +; + %ret = call ptr addrspace(4) @malloc(i64 %size) + call void @llvm.memset.p4.i64(ptr addrspace(4) %ret, i8 0, i64 %size, i1 false) + ret ptr addrspace(4) %ret +} + +declare void @llvm.memset.p4.i64(ptr addrspace(4) nocapture writeonly, i8, i64, i1 immarg) + +declare noalias ptr addrspace(4) @malloc(i64) willreturn allockind("alloc,uninitialized") "alloc-family"="malloc" From 75fad470318958656c50f3842024bd383445d419 Mon Sep 17 00:00:00 2001 From: Youngsuk Kim Date: Mon, 30 Sep 2024 15:05:50 -0500 Subject: [PATCH 041/151] [llvm][AMDGPU] Avoid Type::getPointerTo() (NFC) `llvm::Type::getPointerTo()` is to be deprecated & removed soon. 
--- llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp index 40d2450d775fa..4c596e37476c4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp @@ -953,11 +953,10 @@ static Type* getIntrinsicParamType( case AMDGPULibFunc::IMG1D: case AMDGPULibFunc::IMG2D: case AMDGPULibFunc::IMG3D: - T = StructType::create(C,"ocl_image")->getPointerTo(); break; case AMDGPULibFunc::SAMPLER: - T = StructType::create(C,"ocl_sampler")->getPointerTo(); break; case AMDGPULibFunc::EVENT: - T = StructType::create(C,"ocl_event")->getPointerTo(); break; + T = PointerType::getUnqual(C); + break; default: llvm_unreachable("Unhandled param type"); return nullptr; @@ -965,9 +964,8 @@ static Type* getIntrinsicParamType( if (P.VectorSize > 1) T = FixedVectorType::get(T, P.VectorSize); if (P.PtrKind != AMDGPULibFunc::BYVALUE) - T = useAddrSpace ? T->getPointerTo((P.PtrKind & AMDGPULibFunc::ADDR_SPACE) - - 1) - : T->getPointerTo(); + T = PointerType::get( + C, useAddrSpace ? ((P.PtrKind & AMDGPULibFunc::ADDR_SPACE) - 1) : 0); return T; } From 00128a20eec27246719d73ba427bf821883b00b4 Mon Sep 17 00:00:00 2001 From: realqhc Date: Tue, 1 Oct 2024 11:22:02 +1000 Subject: [PATCH 042/151] [RISCV] Implement Clang Builtins for XCValu Extension in CV32E40P (#100684) This commit adds the Clang Builtins, C API header and relevant tests for XCValu extension. 
Spec: https://github.com/openhwgroup/core-v-sw/blob/master/specifications/corev-builtin-spec.md Contributor: @melonedo, @PaoloS02 --- clang/include/clang/Basic/BuiltinsRISCV.td | 5 + clang/include/clang/Basic/BuiltinsRISCVXCV.td | 41 ++ clang/lib/CodeGen/CGBuiltin.cpp | 54 ++- clang/lib/Headers/CMakeLists.txt | 1 + clang/lib/Headers/riscv_corev_alu.h | 128 ++++++ clang/test/CodeGen/RISCV/riscv-xcvalu-c-api.c | 434 ++++++++++++++++++ clang/test/CodeGen/RISCV/riscv-xcvalu.c | 249 ++++++++++ llvm/include/llvm/IR/IntrinsicsRISCVXCV.td | 16 +- llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td | 17 +- llvm/test/CodeGen/RISCV/xcvalu.ll | 132 +++--- 10 files changed, 1003 insertions(+), 74 deletions(-) create mode 100644 clang/include/clang/Basic/BuiltinsRISCVXCV.td create mode 100644 clang/lib/Headers/riscv_corev_alu.h create mode 100644 clang/test/CodeGen/RISCV/riscv-xcvalu-c-api.c create mode 100644 clang/test/CodeGen/RISCV/riscv-xcvalu.c diff --git a/clang/include/clang/Basic/BuiltinsRISCV.td b/clang/include/clang/Basic/BuiltinsRISCV.td index 4cc89a8a9d8af..3263603a8a1cf 100644 --- a/clang/include/clang/Basic/BuiltinsRISCV.td +++ b/clang/include/clang/Basic/BuiltinsRISCV.td @@ -146,3 +146,8 @@ let Features = "zihintntl", Attributes = [CustomTypeChecking] in { def ntl_load : RISCVBuiltin<"void(...)">; def ntl_store : RISCVBuiltin<"void(...)">; } // Features = "zihintntl", Attributes = [CustomTypeChecking] + +//===----------------------------------------------------------------------===// +// XCV extensions. 
+//===----------------------------------------------------------------------===// +include "clang/Basic/BuiltinsRISCVXCV.td" diff --git a/clang/include/clang/Basic/BuiltinsRISCVXCV.td b/clang/include/clang/Basic/BuiltinsRISCVXCV.td new file mode 100644 index 0000000000000..06ce07ade5c12 --- /dev/null +++ b/clang/include/clang/Basic/BuiltinsRISCVXCV.td @@ -0,0 +1,41 @@ +//==- BuiltinsRISCVXCV.td - RISC-V CORE-V Builtin database ----*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the CORE-V-specific builtin function database. Users of +// this file must define the BUILTIN macro to make use of this information. +// +//===----------------------------------------------------------------------===// + +class RISCXCVBuiltin : TargetBuiltin { + let Spellings = ["__builtin_riscv_cv_" # NAME]; + let Prototype = prototype; + let Features = features; +} + +let Attributes = [NoThrow, Const] in { +//===----------------------------------------------------------------------===// +// XCValu extension. 
+//===----------------------------------------------------------------------===// +def alu_slet : RISCXCVBuiltin<"int(int, int)", "xcvalu">; +def alu_sletu : RISCXCVBuiltin<"int(unsigned int, unsigned int)", "xcvalu">; +def alu_exths : RISCXCVBuiltin<"int(int)", "xcvalu">; +def alu_exthz : RISCXCVBuiltin<"unsigned int(unsigned int)", "xcvalu">; +def alu_extbs : RISCXCVBuiltin<"int(int)", "xcvalu">; +def alu_extbz : RISCXCVBuiltin<"unsigned int(unsigned int)", "xcvalu">; + +def alu_clip : RISCXCVBuiltin<"int(int, int)", "xcvalu">; +def alu_clipu : RISCXCVBuiltin<"unsigned int(unsigned int, unsigned int)", "xcvalu">; +def alu_addN : RISCXCVBuiltin<"int(int, int, unsigned int)", "xcvalu">; +def alu_adduN : RISCXCVBuiltin<"unsigned int(unsigned int, unsigned int, unsigned int)", "xcvalu">; +def alu_addRN : RISCXCVBuiltin<"int(int, int, unsigned int)", "xcvalu">; +def alu_adduRN : RISCXCVBuiltin<"unsigned int(unsigned int, unsigned int, unsigned int)", "xcvalu">; +def alu_subN : RISCXCVBuiltin<"int(int, int, unsigned int)", "xcvalu">; +def alu_subuN : RISCXCVBuiltin<"unsigned int(unsigned int, unsigned int, unsigned int)", "xcvalu">; +def alu_subRN : RISCXCVBuiltin<"int(int, int, unsigned int)", "xcvalu">; +def alu_subuRN : RISCXCVBuiltin<"unsigned int(unsigned int, unsigned int, unsigned int)", "xcvalu">; +} // Attributes = [NoThrow, Const] diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index d739597de4c85..da3eca73bfb57 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -22340,10 +22340,60 @@ Value *CodeGenFunction::EmitRISCVBuiltinExpr(unsigned BuiltinID, return Store; } + // XCValu + case RISCV::BI__builtin_riscv_cv_alu_addN: + ID = Intrinsic::riscv_cv_alu_addN; + break; + case RISCV::BI__builtin_riscv_cv_alu_addRN: + ID = Intrinsic::riscv_cv_alu_addRN; + break; + case RISCV::BI__builtin_riscv_cv_alu_adduN: + ID = Intrinsic::riscv_cv_alu_adduN; + break; + case RISCV::BI__builtin_riscv_cv_alu_adduRN: 
+ ID = Intrinsic::riscv_cv_alu_adduRN; + break; + case RISCV::BI__builtin_riscv_cv_alu_clip: + ID = Intrinsic::riscv_cv_alu_clip; + break; + case RISCV::BI__builtin_riscv_cv_alu_clipu: + ID = Intrinsic::riscv_cv_alu_clipu; + break; + case RISCV::BI__builtin_riscv_cv_alu_extbs: + return Builder.CreateSExt(Builder.CreateTrunc(Ops[0], Int8Ty), Int32Ty, + "extbs"); + case RISCV::BI__builtin_riscv_cv_alu_extbz: + return Builder.CreateZExt(Builder.CreateTrunc(Ops[0], Int8Ty), Int32Ty, + "extbz"); + case RISCV::BI__builtin_riscv_cv_alu_exths: + return Builder.CreateSExt(Builder.CreateTrunc(Ops[0], Int16Ty), Int32Ty, + "exths"); + case RISCV::BI__builtin_riscv_cv_alu_exthz: + return Builder.CreateZExt(Builder.CreateTrunc(Ops[0], Int16Ty), Int32Ty, + "exthz"); + case RISCV::BI__builtin_riscv_cv_alu_slet: + return Builder.CreateZExt(Builder.CreateICmpSLE(Ops[0], Ops[1]), Int32Ty, + "sle"); + case RISCV::BI__builtin_riscv_cv_alu_sletu: + return Builder.CreateZExt(Builder.CreateICmpULE(Ops[0], Ops[1]), Int32Ty, + "sleu"); + case RISCV::BI__builtin_riscv_cv_alu_subN: + ID = Intrinsic::riscv_cv_alu_subN; + break; + case RISCV::BI__builtin_riscv_cv_alu_subRN: + ID = Intrinsic::riscv_cv_alu_subRN; + break; + case RISCV::BI__builtin_riscv_cv_alu_subuN: + ID = Intrinsic::riscv_cv_alu_subuN; + break; + case RISCV::BI__builtin_riscv_cv_alu_subuRN: + ID = Intrinsic::riscv_cv_alu_subuRN; + break; - // Vector builtins are handled from here. + // Vector builtins are handled from here. #include "clang/Basic/riscv_vector_builtin_cg.inc" - // SiFive Vector builtins are handled from here. + + // SiFive Vector builtins are handled from here. 
#include "clang/Basic/riscv_sifive_vector_builtin_cg.inc" } diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index f5cc07c303f9e..ff392e7122a44 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -120,6 +120,7 @@ set(ppc_htm_files set(riscv_files riscv_bitmanip.h + riscv_corev_alu.h riscv_crypto.h riscv_ntlh.h sifive_vector.h diff --git a/clang/lib/Headers/riscv_corev_alu.h b/clang/lib/Headers/riscv_corev_alu.h new file mode 100644 index 0000000000000..d2832ddf72efe --- /dev/null +++ b/clang/lib/Headers/riscv_corev_alu.h @@ -0,0 +1,128 @@ +/*===---- riscv_corev_alu.h - CORE-V ALU intrinsics ------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __RISCV_COREV_ALU_H +#define __RISCV_COREV_ALU_H + +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +#if defined(__riscv_xcvalu) + +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__)) + +static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_abs(long a) { + return __builtin_abs(a); +} + +static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_slet(long a, long b) { + return __builtin_riscv_cv_alu_slet(a, b); +} + +static __inline__ long __DEFAULT_FN_ATTRS +__riscv_cv_alu_sletu(unsigned long a, unsigned long b) { + return __builtin_riscv_cv_alu_sletu(a, b); +} + +static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_min(long a, long b) { + return __builtin_elementwise_min(a, b); +} + +static __inline__ unsigned long __DEFAULT_FN_ATTRS +__riscv_cv_alu_minu(unsigned long a, unsigned long b) { + return __builtin_elementwise_min(a, b); +} + +static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_max(long a, long b) { + return 
__builtin_elementwise_max(a, b); +} + +static __inline__ unsigned long __DEFAULT_FN_ATTRS +__riscv_cv_alu_maxu(unsigned long a, unsigned long b) { + return __builtin_elementwise_max(a, b); +} + +static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_exths(int16_t a) { + return __builtin_riscv_cv_alu_exths(a); +} + +static __inline__ unsigned long __DEFAULT_FN_ATTRS +__riscv_cv_alu_exthz(uint16_t a) { + return __builtin_riscv_cv_alu_exthz(a); +} + +static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_extbs(int8_t a) { + return __builtin_riscv_cv_alu_extbs(a); +} + +static __inline__ unsigned long __DEFAULT_FN_ATTRS +__riscv_cv_alu_extbz(uint8_t a) { + return __builtin_riscv_cv_alu_extbz(a); +} + +static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_clip(long a, + unsigned long b) { + return __builtin_riscv_cv_alu_clip(a, b); +} + +static __inline__ unsigned long __DEFAULT_FN_ATTRS +__riscv_cv_alu_clipu(unsigned long a, unsigned long b) { + return __builtin_riscv_cv_alu_clipu(a, b); +} + +static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_addN(long a, long b, + uint8_t shft) { + return __builtin_riscv_cv_alu_addN(a, b, shft); +} + +static __inline__ unsigned long __DEFAULT_FN_ATTRS +__riscv_cv_alu_adduN(unsigned long a, unsigned long b, uint8_t shft) { + return __builtin_riscv_cv_alu_adduN(a, b, shft); +} + +static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_addRN(long a, long b, + uint8_t shft) { + return __builtin_riscv_cv_alu_addRN(a, b, shft); +} + +static __inline__ unsigned long __DEFAULT_FN_ATTRS +__riscv_cv_alu_adduRN(unsigned long a, unsigned long b, uint8_t shft) { + return __builtin_riscv_cv_alu_adduRN(a, b, shft); +} + +static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_subN(long a, long b, + uint8_t shft) { + return __builtin_riscv_cv_alu_subN(a, b, shft); +} + +static __inline__ unsigned long __DEFAULT_FN_ATTRS +__riscv_cv_alu_subuN(unsigned long a, unsigned long b, uint8_t shft) { + return __builtin_riscv_cv_alu_subuN(a, b, 
shft); +} + +static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_subRN(long a, long b, + uint8_t shft) { + return __builtin_riscv_cv_alu_subRN(a, b, shft); +} + +static __inline__ unsigned long __DEFAULT_FN_ATTRS +__riscv_cv_alu_subuRN(unsigned long a, unsigned long b, uint8_t shft) { + return __builtin_riscv_cv_alu_subuRN(a, b, shft); +} + +#endif // defined(__riscv_xcvalu) + +#if defined(__cplusplus) +} +#endif + +#endif // define __RISCV_COREV_ALU_H diff --git a/clang/test/CodeGen/RISCV/riscv-xcvalu-c-api.c b/clang/test/CodeGen/RISCV/riscv-xcvalu-c-api.c new file mode 100644 index 0000000000000..b4690a5f1c1ca --- /dev/null +++ b/clang/test/CodeGen/RISCV/riscv-xcvalu-c-api.c @@ -0,0 +1,434 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple riscv32 -target-feature +xcvalu -emit-llvm %s -o - \ +// RUN: | FileCheck %s + +#include +#include + +// CHECK-LABEL: @test_alu_slet( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: store i32 [[TMP1]], ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = icmp sle i32 [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[SLE_I:%.*]] = zext i1 [[TMP4]] to i32 +// CHECK-NEXT: ret i32 [[SLE_I]] +// +int test_alu_slet(int32_t a, int32_t b) { + return __riscv_cv_alu_slet(a, b); +} + +// CHECK-LABEL: @test_alu_sletu( +// CHECK-NEXT: entry: 
+// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: store i32 [[TMP1]], ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = icmp ule i32 [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[SLEU_I:%.*]] = zext i1 [[TMP4]] to i32 +// CHECK-NEXT: ret i32 [[SLEU_I]] +// +int test_alu_sletu(uint32_t a, uint32_t b) { + return __riscv_cv_alu_sletu(a, b); +} + +// CHECK-LABEL: @test_alu_min( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: store i32 [[TMP1]], ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[ELT_MIN_I:%.*]] = call i32 @llvm.smin.i32(i32 [[TMP2]], i32 [[TMP3]]) +// CHECK-NEXT: ret i32 [[ELT_MIN_I]] +// +int test_alu_min(int32_t a, int32_t b) { + return __riscv_cv_alu_min(a, b); +} + +// CHECK-LABEL: @test_alu_minu( 
+// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: store i32 [[TMP1]], ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[ELT_MIN_I:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP2]], i32 [[TMP3]]) +// CHECK-NEXT: ret i32 [[ELT_MIN_I]] +// +int test_alu_minu(uint32_t a, uint32_t b) { + return __riscv_cv_alu_minu(a, b); +} + +// CHECK-LABEL: @test_alu_max( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: store i32 [[TMP1]], ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[ELT_MAX_I:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP2]], i32 [[TMP3]]) +// CHECK-NEXT: ret i32 [[ELT_MAX_I]] +// +int test_alu_max(int32_t a, int32_t b) { + return __riscv_cv_alu_max(a, b); +} + +// CHECK-LABEL: @test_alu_maxu( +// 
CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: store i32 [[TMP1]], ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[ELT_MAX_I:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP2]], i32 [[TMP3]]) +// CHECK-NEXT: ret i32 [[ELT_MAX_I]] +// +int test_alu_maxu(uint32_t a, uint32_t b) { + return __riscv_cv_alu_maxu(a, b); +} + +// CHECK-LABEL: @test_alu_exths( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i16, align 2 +// CHECK-NEXT: store i16 [[A:%.*]], ptr [[A_ADDR]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A_ADDR]], align 2 +// CHECK-NEXT: store i16 [[TMP0]], ptr [[A_ADDR_I]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[A_ADDR_I]], align 2 +// CHECK-NEXT: [[CONV_I:%.*]] = sext i16 [[TMP1]] to i32 +// CHECK-NEXT: [[EXTHS_I:%.*]] = sext i16 [[TMP1]] to i32 +// CHECK-NEXT: ret i32 [[EXTHS_I]] +// +int test_alu_exths(int16_t a) { + return __riscv_cv_alu_exths(a); +} + +// CHECK-LABEL: @test_alu_exthz( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i16, align 2 +// CHECK-NEXT: store i16 [[A:%.*]], ptr [[A_ADDR]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A_ADDR]], align 2 +// CHECK-NEXT: store i16 [[TMP0]], ptr [[A_ADDR_I]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load 
i16, ptr [[A_ADDR_I]], align 2 +// CHECK-NEXT: [[CONV_I:%.*]] = zext i16 [[TMP1]] to i32 +// CHECK-NEXT: [[EXTHZ_I:%.*]] = zext i16 [[TMP1]] to i32 +// CHECK-NEXT: ret i32 [[EXTHZ_I]] +// +int test_alu_exthz(uint16_t a) { + return __riscv_cv_alu_exthz(a); +} + +// CHECK-LABEL: @test_alu_extbs( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i8, align 1 +// CHECK-NEXT: store i8 [[A:%.*]], ptr [[A_ADDR]], align 1 +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A_ADDR]], align 1 +// CHECK-NEXT: store i8 [[TMP0]], ptr [[A_ADDR_I]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[A_ADDR_I]], align 1 +// CHECK-NEXT: [[CONV_I:%.*]] = sext i8 [[TMP1]] to i32 +// CHECK-NEXT: [[EXTBS_I:%.*]] = sext i8 [[TMP1]] to i32 +// CHECK-NEXT: ret i32 [[EXTBS_I]] +// +int test_alu_extbs(int8_t a) { + return __riscv_cv_alu_extbs(a); +} + +// CHECK-LABEL: @test_alu_extbz( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i8, align 1 +// CHECK-NEXT: store i8 [[A:%.*]], ptr [[A_ADDR]], align 1 +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A_ADDR]], align 1 +// CHECK-NEXT: store i8 [[TMP0]], ptr [[A_ADDR_I]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[A_ADDR_I]], align 1 +// CHECK-NEXT: [[CONV_I:%.*]] = zext i8 [[TMP1]] to i32 +// CHECK-NEXT: [[EXTBZ_I:%.*]] = zext i8 [[TMP1]] to i32 +// CHECK-NEXT: ret i32 [[EXTBZ_I]] +// +int test_alu_extbz(uint8_t a) { + return __riscv_cv_alu_extbz(a); +} + +// CHECK-LABEL: @test_alu_clip( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: store i32 0, ptr [[B_ADDR_I]], 
align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.riscv.cv.alu.clip(i32 [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: ret i32 [[TMP3]] +// +int test_alu_clip(int32_t a) { + return __riscv_cv_alu_clip(a, 0); +} + +// CHECK-LABEL: @test_alu_clipu( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: store i32 0, ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.riscv.cv.alu.clipu(i32 [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: ret i32 [[TMP3]] +// +int test_alu_clipu(uint32_t a) { + return __riscv_cv_alu_clipu(a, 0); +} + +// CHECK-LABEL: @test_alu_addN( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[SHFT_ADDR_I:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: store i32 [[TMP1]], ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: store i8 0, ptr [[SHFT_ADDR_I]], align 1 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: 
[[TMP3:%.*]] = load i32, ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[SHFT_ADDR_I]], align 1 +// CHECK-NEXT: [[CONV_I:%.*]] = zext i8 [[TMP4]] to i32 +// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.riscv.cv.alu.addN(i32 [[TMP2]], i32 [[TMP3]], i32 [[CONV_I]]) +// CHECK-NEXT: ret i32 [[TMP5]] +// +int test_alu_addN(int32_t a, int32_t b) { + return __riscv_cv_alu_addN(a, b, 0); +} + +// CHECK-LABEL: @test_alu_adduN( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[SHFT_ADDR_I:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: store i32 [[TMP1]], ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: store i8 0, ptr [[SHFT_ADDR_I]], align 1 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[SHFT_ADDR_I]], align 1 +// CHECK-NEXT: [[CONV_I:%.*]] = zext i8 [[TMP4]] to i32 +// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.riscv.cv.alu.adduN(i32 [[TMP2]], i32 [[TMP3]], i32 [[CONV_I]]) +// CHECK-NEXT: ret i32 [[TMP5]] +// +int test_alu_adduN(uint32_t a, uint32_t b) { + return __riscv_cv_alu_adduN(a, b, 0); +} + +// CHECK-LABEL: @test_alu_addRN( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[SHFT_ADDR_I:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 
+// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: store i32 [[TMP1]], ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: store i8 0, ptr [[SHFT_ADDR_I]], align 1 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[SHFT_ADDR_I]], align 1 +// CHECK-NEXT: [[CONV_I:%.*]] = zext i8 [[TMP4]] to i32 +// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.riscv.cv.alu.addRN(i32 [[TMP2]], i32 [[TMP3]], i32 [[CONV_I]]) +// CHECK-NEXT: ret i32 [[TMP5]] +// +int test_alu_addRN(int32_t a, int32_t b) { + return __riscv_cv_alu_addRN(a, b, 0); +} + +// CHECK-LABEL: @test_alu_adduRN( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[SHFT_ADDR_I:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: store i32 [[TMP1]], ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: store i8 0, ptr [[SHFT_ADDR_I]], align 1 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[SHFT_ADDR_I]], align 1 +// CHECK-NEXT: [[CONV_I:%.*]] = zext i8 [[TMP4]] to i32 +// CHECK-NEXT: [[TMP5:%.*]] = call i32 
@llvm.riscv.cv.alu.adduRN(i32 [[TMP2]], i32 [[TMP3]], i32 [[CONV_I]]) +// CHECK-NEXT: ret i32 [[TMP5]] +// +int test_alu_adduRN(uint32_t a, uint32_t b) { + return __riscv_cv_alu_adduRN(a, b, 0); +} + +// CHECK-LABEL: @test_alu_subN( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[SHFT_ADDR_I:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: store i32 [[TMP1]], ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: store i8 0, ptr [[SHFT_ADDR_I]], align 1 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[SHFT_ADDR_I]], align 1 +// CHECK-NEXT: [[CONV_I:%.*]] = zext i8 [[TMP4]] to i32 +// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.riscv.cv.alu.subN(i32 [[TMP2]], i32 [[TMP3]], i32 [[CONV_I]]) +// CHECK-NEXT: ret i32 [[TMP5]] +// +int test_alu_subN(int32_t a, int32_t b) { + return __riscv_cv_alu_subN(a, b, 0); +} + +// CHECK-LABEL: @test_alu_subuN( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[SHFT_ADDR_I:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] 
= load i32, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: store i32 [[TMP1]], ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: store i8 0, ptr [[SHFT_ADDR_I]], align 1 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[SHFT_ADDR_I]], align 1 +// CHECK-NEXT: [[CONV_I:%.*]] = zext i8 [[TMP4]] to i32 +// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.riscv.cv.alu.subuN(i32 [[TMP2]], i32 [[TMP3]], i32 [[CONV_I]]) +// CHECK-NEXT: ret i32 [[TMP5]] +// +int test_alu_subuN(uint32_t a, uint32_t b) { + return __riscv_cv_alu_subuN(a, b, 0); +} + +// CHECK-LABEL: @test_alu_subRN( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[SHFT_ADDR_I:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: store i32 [[TMP1]], ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: store i8 0, ptr [[SHFT_ADDR_I]], align 1 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[SHFT_ADDR_I]], align 1 +// CHECK-NEXT: [[CONV_I:%.*]] = zext i8 [[TMP4]] to i32 +// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.riscv.cv.alu.subRN(i32 [[TMP2]], i32 [[TMP3]], i32 [[CONV_I]]) +// CHECK-NEXT: ret i32 [[TMP5]] +// +int test_alu_subRN(int32_t a, int32_t b) { + return __riscv_cv_alu_subRN(a, b, 0); +} + +// CHECK-LABEL: 
@test_alu_subuRN( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR_I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[SHFT_ADDR_I:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: store i32 [[TMP1]], ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: store i8 0, ptr [[SHFT_ADDR_I]], align 1 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR_I]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[SHFT_ADDR_I]], align 1 +// CHECK-NEXT: [[CONV_I:%.*]] = zext i8 [[TMP4]] to i32 +// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.riscv.cv.alu.subuRN(i32 [[TMP2]], i32 [[TMP3]], i32 [[CONV_I]]) +// CHECK-NEXT: ret i32 [[TMP5]] +// +int test_alu_subuRN(uint32_t a, uint32_t b) { + return __riscv_cv_alu_subuRN(a, b, 0); +} diff --git a/clang/test/CodeGen/RISCV/riscv-xcvalu.c b/clang/test/CodeGen/RISCV/riscv-xcvalu.c new file mode 100644 index 0000000000000..e4c2a2c3ca28b --- /dev/null +++ b/clang/test/CodeGen/RISCV/riscv-xcvalu.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple riscv32 -target-feature +xcvalu -emit-llvm %s -o - \ +// RUN: | FileCheck %s + +#include + +// CHECK-LABEL: @test_abs( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[TMP0]], i1 true) +// CHECK-NEXT: ret 
i32 [[TMP1]] +// +int test_abs(int a) { + return __builtin_abs(a); +} + +// CHECK-LABEL: @test_alu_slet( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = icmp sle i32 [[TMP0]], [[TMP1]] +// CHECK-NEXT: [[SLE:%.*]] = zext i1 [[TMP2]] to i32 +// CHECK-NEXT: ret i32 [[SLE]] +// +int test_alu_slet(int32_t a, int32_t b) { + return __builtin_riscv_cv_alu_slet(a, b); +} + +// CHECK-LABEL: @test_alu_sletu( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = icmp ule i32 [[TMP0]], [[TMP1]] +// CHECK-NEXT: [[SLEU:%.*]] = zext i1 [[TMP2]] to i32 +// CHECK-NEXT: ret i32 [[SLEU]] +// +int test_alu_sletu(uint32_t a, uint32_t b) { + return __builtin_riscv_cv_alu_sletu(a, b); +} + +// CHECK-LABEL: @test_alu_exths( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i16, align 2 +// CHECK-NEXT: store i16 [[A:%.*]], ptr [[A_ADDR]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A_ADDR]], align 2 +// CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[CONV]] to i16 +// CHECK-NEXT: [[EXTHS:%.*]] = sext i16 [[TMP1]] to i32 +// CHECK-NEXT: ret i32 [[EXTHS]] +// +int test_alu_exths(int16_t a) { + return __builtin_riscv_cv_alu_exths(a); +} + +// CHECK-LABEL: @test_alu_exthz( +// CHECK-NEXT: entry: +// CHECK-NEXT: 
[[A_ADDR:%.*]] = alloca i16, align 2 +// CHECK-NEXT: store i16 [[A:%.*]], ptr [[A_ADDR]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A_ADDR]], align 2 +// CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[CONV]] to i16 +// CHECK-NEXT: [[EXTHZ:%.*]] = zext i16 [[TMP1]] to i32 +// CHECK-NEXT: ret i32 [[EXTHZ]] +// +int test_alu_exthz(uint16_t a) { + return __builtin_riscv_cv_alu_exthz(a); +} + +// CHECK-LABEL: @test_alu_extbs( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i8, align 1 +// CHECK-NEXT: store i8 [[A:%.*]], ptr [[A_ADDR]], align 1 +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A_ADDR]], align 1 +// CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[CONV]] to i8 +// CHECK-NEXT: [[EXTBS:%.*]] = sext i8 [[TMP1]] to i32 +// CHECK-NEXT: ret i32 [[EXTBS]] +// +int test_alu_extbs(int8_t a) { + return __builtin_riscv_cv_alu_extbs(a); +} + +// CHECK-LABEL: @test_alu_extbz( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i8, align 1 +// CHECK-NEXT: store i8 [[A:%.*]], ptr [[A_ADDR]], align 1 +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A_ADDR]], align 1 +// CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[CONV]] to i8 +// CHECK-NEXT: [[EXTBZ:%.*]] = zext i8 [[TMP1]] to i32 +// CHECK-NEXT: ret i32 [[EXTBZ]] +// +int test_alu_extbz(uint8_t a) { + return __builtin_riscv_cv_alu_extbz(a); +} + +// CHECK-LABEL: @test_alu_clip( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.riscv.cv.alu.clip(i32 [[TMP0]], i32 15) +// CHECK-NEXT: ret i32 [[TMP1]] +// +int test_alu_clip(int32_t a) { + return __builtin_riscv_cv_alu_clip(a, 15); +} + +// CHECK-LABEL: @test_alu_clipu( +// CHECK-NEXT: entry: +// CHECK-NEXT: 
[[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.riscv.cv.alu.clipu(i32 [[TMP0]], i32 15) +// CHECK-NEXT: ret i32 [[TMP1]] +// +int test_alu_clipu(uint32_t a) { + return __builtin_riscv_cv_alu_clipu(a, 15); +} + +// CHECK-LABEL: @test_alu_addN( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.riscv.cv.alu.addN(i32 [[TMP0]], i32 [[TMP1]], i32 0) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int test_alu_addN(int32_t a, int32_t b) { + return __builtin_riscv_cv_alu_addN(a, b, 0); +} + +// CHECK-LABEL: @test_alu_adduN( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.riscv.cv.alu.adduN(i32 [[TMP0]], i32 [[TMP1]], i32 0) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int test_alu_adduN(uint32_t a, uint32_t b) { + return __builtin_riscv_cv_alu_adduN(a, b, 0); +} + +// CHECK-LABEL: @test_alu_addRN( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr 
[[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.riscv.cv.alu.addRN(i32 [[TMP0]], i32 [[TMP1]], i32 0) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int test_alu_addRN(int32_t a, int32_t b) { + return __builtin_riscv_cv_alu_addRN(a, b, 0); +} + +// CHECK-LABEL: @test_alu_adduRN( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.riscv.cv.alu.adduRN(i32 [[TMP0]], i32 [[TMP1]], i32 0) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int test_alu_adduRN(uint32_t a, uint32_t b) { + return __builtin_riscv_cv_alu_adduRN(a, b, 0); +} + +// CHECK-LABEL: @test_alu_subN( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.riscv.cv.alu.subN(i32 [[TMP0]], i32 [[TMP1]], i32 0) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int test_alu_subN(int32_t a, int32_t b) { + return __builtin_riscv_cv_alu_subN(a, b, 0); +} + +// CHECK-LABEL: @test_alu_subuN( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = 
load i32, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.riscv.cv.alu.subuN(i32 [[TMP0]], i32 [[TMP1]], i32 0) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int test_alu_subuN(uint32_t a, uint32_t b) { + return __builtin_riscv_cv_alu_subuN(a, b, 0); +} + +// CHECK-LABEL: @test_alu_subRN( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.riscv.cv.alu.subRN(i32 [[TMP0]], i32 [[TMP1]], i32 0) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int test_alu_subRN(int32_t a, int32_t b) { + return __builtin_riscv_cv_alu_subRN(a, b, 0); +} + +// CHECK-LABEL: @test_alu_subuRN( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.riscv.cv.alu.subuRN(i32 [[TMP0]], i32 [[TMP1]], i32 0) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int test_alu_subuRN(uint32_t a, uint32_t b) { + return __builtin_riscv_cv_alu_subuRN(a, b, 0); +} diff --git a/llvm/include/llvm/IR/IntrinsicsRISCVXCV.td b/llvm/include/llvm/IR/IntrinsicsRISCVXCV.td index 38263f375c469..6e7e90438c621 100644 --- a/llvm/include/llvm/IR/IntrinsicsRISCVXCV.td +++ b/llvm/include/llvm/IR/IntrinsicsRISCVXCV.td @@ -61,14 +61,14 @@ let TargetPrefix = "riscv" in { def int_riscv_cv_alu_clip : ScalarCoreVAluGprGprIntrinsic; def int_riscv_cv_alu_clipu : ScalarCoreVAluGprGprIntrinsic; - def 
int_riscv_cv_alu_addn : ScalarCoreVAluGprGprGprIntrinsic; - def int_riscv_cv_alu_addun : ScalarCoreVAluGprGprGprIntrinsic; - def int_riscv_cv_alu_addrn : ScalarCoreVAluGprGprGprIntrinsic; - def int_riscv_cv_alu_addurn : ScalarCoreVAluGprGprGprIntrinsic; - def int_riscv_cv_alu_subn : ScalarCoreVAluGprGprGprIntrinsic; - def int_riscv_cv_alu_subun : ScalarCoreVAluGprGprGprIntrinsic; - def int_riscv_cv_alu_subrn : ScalarCoreVAluGprGprGprIntrinsic; - def int_riscv_cv_alu_suburn : ScalarCoreVAluGprGprGprIntrinsic; + def int_riscv_cv_alu_addN : ScalarCoreVAluGprGprGprIntrinsic; + def int_riscv_cv_alu_adduN : ScalarCoreVAluGprGprGprIntrinsic; + def int_riscv_cv_alu_addRN : ScalarCoreVAluGprGprGprIntrinsic; + def int_riscv_cv_alu_adduRN : ScalarCoreVAluGprGprGprIntrinsic; + def int_riscv_cv_alu_subN : ScalarCoreVAluGprGprGprIntrinsic; + def int_riscv_cv_alu_subuN : ScalarCoreVAluGprGprGprIntrinsic; + def int_riscv_cv_alu_subRN : ScalarCoreVAluGprGprGprIntrinsic; + def int_riscv_cv_alu_subuRN : ScalarCoreVAluGprGprGprIntrinsic; def int_riscv_cv_mac_mac : ScalarCoreVMacGprGprGprIntrinsic; def int_riscv_cv_mac_msu : ScalarCoreVMacGprGprGprIntrinsic; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td index b586b10192fff..b54baa16d9286 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td @@ -792,17 +792,18 @@ let Predicates = [HasVendorXCValu, IsRV32], AddedComplexity = 1 in { def : Pat<(sext_inreg (XLenVT GPR:$rs1), i16), (CV_EXTHS GPR:$rs1)>; def : Pat<(sext_inreg (XLenVT GPR:$rs1), i8), (CV_EXTBS GPR:$rs1)>; def : Pat<(and (XLenVT GPR:$rs1), 0xffff), (CV_EXTHZ GPR:$rs1)>; + def : Pat<(and (XLenVT GPR:$rs1), 0xff), (CV_EXTBZ GPR:$rs1)>; defm CLIP : PatCoreVAluGprImm; defm CLIPU : PatCoreVAluGprImm; - defm ADDN : PatCoreVAluGprGprImm; - defm ADDUN : PatCoreVAluGprGprImm; - defm ADDRN : PatCoreVAluGprGprImm; - defm ADDURN : PatCoreVAluGprGprImm; - defm SUBN : 
PatCoreVAluGprGprImm; - defm SUBUN : PatCoreVAluGprGprImm; - defm SUBRN : PatCoreVAluGprGprImm; - defm SUBURN : PatCoreVAluGprGprImm; + defm ADDN : PatCoreVAluGprGprImm; + defm ADDUN : PatCoreVAluGprGprImm; + defm ADDRN : PatCoreVAluGprGprImm; + defm ADDURN : PatCoreVAluGprGprImm; + defm SUBN : PatCoreVAluGprGprImm; + defm SUBUN : PatCoreVAluGprGprImm; + defm SUBRN : PatCoreVAluGprGprImm; + defm SUBURN : PatCoreVAluGprGprImm; } // Predicates = [HasVendorXCValu, IsRV32] //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/RISCV/xcvalu.ll b/llvm/test/CodeGen/RISCV/xcvalu.ll index 1ddfa102aca71..54634383dfdac 100644 --- a/llvm/test/CodeGen/RISCV/xcvalu.ll +++ b/llvm/test/CodeGen/RISCV/xcvalu.ll @@ -91,6 +91,26 @@ define i32 @exthz(i16 %a) { ret i32 %1 } +define i32 @extbs(i8 %a) { +; CHECK-LABEL: extbs: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $x11 killed $x10 +; CHECK-NEXT: cv.extbs a0, a0 +; CHECK-NEXT: ret + %1 = sext i8 %a to i32 + ret i32 %1 +} + +define i32 @extbz(i8 %a) { +; CHECK-LABEL: extbz: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $x11 killed $x10 +; CHECK-NEXT: cv.extbz a0, a0 +; CHECK-NEXT: ret + %1 = zext i8 %a to i32 + ret i32 %1 +} + declare i32 @llvm.riscv.cv.alu.clip(i32, i32) define i32 @test.cv.alu.clip.case.a(i32 %a) { @@ -133,170 +153,170 @@ define i32 @test.cv.alu.clipu.case.b(i32 %a) { ret i32 %1 } -declare i32 @llvm.riscv.cv.alu.addn(i32, i32, i32) +declare i32 @llvm.riscv.cv.alu.addN(i32, i32, i32) -define i32 @test.cv.alu.addn.case.a(i32 %a, i32 %b) { -; CHECK-LABEL: test.cv.alu.addn.case.a: +define i32 @test.cv.alu.addN.case.a(i32 %a, i32 %b) { +; CHECK-LABEL: test.cv.alu.addN.case.a: ; CHECK: # %bb.0: ; CHECK-NEXT: cv.addn a0, a0, a1, 15 ; CHECK-NEXT: ret - %1 = call i32 @llvm.riscv.cv.alu.addn(i32 %a, i32 %b, i32 15) + %1 = call i32 @llvm.riscv.cv.alu.addN(i32 %a, i32 %b, i32 15) ret i32 %1 } -define i32 @test.cv.alu.addn.case.b(i32 %a, i32 %b) { -; CHECK-LABEL: 
test.cv.alu.addn.case.b: +define i32 @test.cv.alu.addN.case.b(i32 %a, i32 %b) { +; CHECK-LABEL: test.cv.alu.addN.case.b: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: cv.addnr a0, a1, a2 ; CHECK-NEXT: ret - %1 = call i32 @llvm.riscv.cv.alu.addn(i32 %a, i32 %b, i32 32) + %1 = call i32 @llvm.riscv.cv.alu.addN(i32 %a, i32 %b, i32 32) ret i32 %1 } -declare i32 @llvm.riscv.cv.alu.addun(i32, i32, i32) +declare i32 @llvm.riscv.cv.alu.adduN(i32, i32, i32) -define i32 @test.cv.alu.addun.case.a(i32 %a, i32 %b) { -; CHECK-LABEL: test.cv.alu.addun.case.a: +define i32 @test.cv.alu.adduN.case.a(i32 %a, i32 %b) { +; CHECK-LABEL: test.cv.alu.adduN.case.a: ; CHECK: # %bb.0: ; CHECK-NEXT: cv.addun a0, a0, a1, 15 ; CHECK-NEXT: ret - %1 = call i32 @llvm.riscv.cv.alu.addun(i32 %a, i32 %b, i32 15) + %1 = call i32 @llvm.riscv.cv.alu.adduN(i32 %a, i32 %b, i32 15) ret i32 %1 } -define i32 @test.cv.alu.addun.case.b(i32 %a, i32 %b) { -; CHECK-LABEL: test.cv.alu.addun.case.b: +define i32 @test.cv.alu.adduN.case.b(i32 %a, i32 %b) { +; CHECK-LABEL: test.cv.alu.adduN.case.b: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: cv.addunr a0, a1, a2 ; CHECK-NEXT: ret - %1 = call i32 @llvm.riscv.cv.alu.addun(i32 %a, i32 %b, i32 32) + %1 = call i32 @llvm.riscv.cv.alu.adduN(i32 %a, i32 %b, i32 32) ret i32 %1 } -declare i32 @llvm.riscv.cv.alu.addrn(i32, i32, i32) +declare i32 @llvm.riscv.cv.alu.addRN(i32, i32, i32) -define i32 @test.cv.alu.addrn.case.a(i32 %a, i32 %b) { -; CHECK-LABEL: test.cv.alu.addrn.case.a: +define i32 @test.cv.alu.addRN.case.a(i32 %a, i32 %b) { +; CHECK-LABEL: test.cv.alu.addRN.case.a: ; CHECK: # %bb.0: ; CHECK-NEXT: cv.addrn a0, a0, a1, 15 ; CHECK-NEXT: ret - %1 = call i32 @llvm.riscv.cv.alu.addrn(i32 %a, i32 %b, i32 15) + %1 = call i32 @llvm.riscv.cv.alu.addRN(i32 %a, i32 %b, i32 15) ret i32 %1 } -define i32 @test.cv.alu.addrn.case.b(i32 %a, i32 %b) { -; CHECK-LABEL: test.cv.alu.addrn.case.b: +define i32 @test.cv.alu.addRN.case.b(i32 %a, i32 %b) { +; CHECK-LABEL: 
test.cv.alu.addRN.case.b: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: cv.addrnr a0, a1, a2 ; CHECK-NEXT: ret - %1 = call i32 @llvm.riscv.cv.alu.addrn(i32 %a, i32 %b, i32 32) + %1 = call i32 @llvm.riscv.cv.alu.addRN(i32 %a, i32 %b, i32 32) ret i32 %1 } -declare i32 @llvm.riscv.cv.alu.addurn(i32, i32, i32) +declare i32 @llvm.riscv.cv.alu.adduRN(i32, i32, i32) -define i32 @test.cv.alu.addurn.case.a(i32 %a, i32 %b) { -; CHECK-LABEL: test.cv.alu.addurn.case.a: +define i32 @test.cv.alu.adduRN.case.a(i32 %a, i32 %b) { +; CHECK-LABEL: test.cv.alu.adduRN.case.a: ; CHECK: # %bb.0: ; CHECK-NEXT: cv.addurn a0, a0, a1, 15 ; CHECK-NEXT: ret - %1 = call i32 @llvm.riscv.cv.alu.addurn(i32 %a, i32 %b, i32 15) + %1 = call i32 @llvm.riscv.cv.alu.adduRN(i32 %a, i32 %b, i32 15) ret i32 %1 } -define i32 @test.cv.alu.addurn.case.b(i32 %a, i32 %b) { -; CHECK-LABEL: test.cv.alu.addurn.case.b: +define i32 @test.cv.alu.adduRN.case.b(i32 %a, i32 %b) { +; CHECK-LABEL: test.cv.alu.adduRN.case.b: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: cv.addurnr a0, a1, a2 ; CHECK-NEXT: ret - %1 = call i32 @llvm.riscv.cv.alu.addurn(i32 %a, i32 %b, i32 32) + %1 = call i32 @llvm.riscv.cv.alu.adduRN(i32 %a, i32 %b, i32 32) ret i32 %1 } -declare i32 @llvm.riscv.cv.alu.subn(i32, i32, i32) +declare i32 @llvm.riscv.cv.alu.subN(i32, i32, i32) -define i32 @test.cv.alu.subn.case.a(i32 %a, i32 %b) { -; CHECK-LABEL: test.cv.alu.subn.case.a: +define i32 @test.cv.alu.subN.case.a(i32 %a, i32 %b) { +; CHECK-LABEL: test.cv.alu.subN.case.a: ; CHECK: # %bb.0: ; CHECK-NEXT: cv.subn a0, a0, a1, 15 ; CHECK-NEXT: ret - %1 = call i32 @llvm.riscv.cv.alu.subn(i32 %a, i32 %b, i32 15) + %1 = call i32 @llvm.riscv.cv.alu.subN(i32 %a, i32 %b, i32 15) ret i32 %1 } -define i32 @test.cv.alu.subn.case.b(i32 %a, i32 %b) { -; CHECK-LABEL: test.cv.alu.subn.case.b: +define i32 @test.cv.alu.subN.case.b(i32 %a, i32 %b) { +; CHECK-LABEL: test.cv.alu.subN.case.b: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: cv.subnr 
a0, a1, a2 ; CHECK-NEXT: ret - %1 = call i32 @llvm.riscv.cv.alu.subn(i32 %a, i32 %b, i32 32) + %1 = call i32 @llvm.riscv.cv.alu.subN(i32 %a, i32 %b, i32 32) ret i32 %1 } -declare i32 @llvm.riscv.cv.alu.subun(i32, i32, i32) +declare i32 @llvm.riscv.cv.alu.subuN(i32, i32, i32) -define i32 @test.cv.alu.subun.case.a(i32 %a, i32 %b) { -; CHECK-LABEL: test.cv.alu.subun.case.a: +define i32 @test.cv.alu.subuN.case.a(i32 %a, i32 %b) { +; CHECK-LABEL: test.cv.alu.subuN.case.a: ; CHECK: # %bb.0: ; CHECK-NEXT: cv.subun a0, a0, a1, 15 ; CHECK-NEXT: ret - %1 = call i32 @llvm.riscv.cv.alu.subun(i32 %a, i32 %b, i32 15) + %1 = call i32 @llvm.riscv.cv.alu.subuN(i32 %a, i32 %b, i32 15) ret i32 %1 } -define i32 @test.cv.alu.subun.case.b(i32 %a, i32 %b) { -; CHECK-LABEL: test.cv.alu.subun.case.b: +define i32 @test.cv.alu.subuN.case.b(i32 %a, i32 %b) { +; CHECK-LABEL: test.cv.alu.subuN.case.b: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: cv.subunr a0, a1, a2 ; CHECK-NEXT: ret - %1 = call i32 @llvm.riscv.cv.alu.subun(i32 %a, i32 %b, i32 32) + %1 = call i32 @llvm.riscv.cv.alu.subuN(i32 %a, i32 %b, i32 32) ret i32 %1 } -declare i32 @llvm.riscv.cv.alu.subrn(i32, i32, i32) +declare i32 @llvm.riscv.cv.alu.subRN(i32, i32, i32) -define i32 @test.cv.alu.subrn.case.a(i32 %a, i32 %b) { -; CHECK-LABEL: test.cv.alu.subrn.case.a: +define i32 @test.cv.alu.subRN.case.a(i32 %a, i32 %b) { +; CHECK-LABEL: test.cv.alu.subRN.case.a: ; CHECK: # %bb.0: ; CHECK-NEXT: cv.subrn a0, a0, a1, 15 ; CHECK-NEXT: ret - %1 = call i32 @llvm.riscv.cv.alu.subrn(i32 %a, i32 %b, i32 15) + %1 = call i32 @llvm.riscv.cv.alu.subRN(i32 %a, i32 %b, i32 15) ret i32 %1 } -define i32 @test.cv.alu.subrn.case.b(i32 %a, i32 %b) { -; CHECK-LABEL: test.cv.alu.subrn.case.b: +define i32 @test.cv.alu.subRN.case.b(i32 %a, i32 %b) { +; CHECK-LABEL: test.cv.alu.subRN.case.b: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: cv.subrnr a0, a1, a2 ; CHECK-NEXT: ret - %1 = call i32 @llvm.riscv.cv.alu.subrn(i32 %a, i32 %b, i32 32) + 
%1 = call i32 @llvm.riscv.cv.alu.subRN(i32 %a, i32 %b, i32 32) ret i32 %1 } -declare i32 @llvm.riscv.cv.alu.suburn(i32, i32, i32) +declare i32 @llvm.riscv.cv.alu.subuRN(i32, i32, i32) -define i32 @test.cv.alu.suburn.case.a(i32 %a, i32 %b) { -; CHECK-LABEL: test.cv.alu.suburn.case.a: +define i32 @test.cv.alu.subuRN.case.a(i32 %a, i32 %b) { +; CHECK-LABEL: test.cv.alu.subuRN.case.a: ; CHECK: # %bb.0: ; CHECK-NEXT: cv.suburn a0, a0, a1, 15 ; CHECK-NEXT: ret - %1 = call i32 @llvm.riscv.cv.alu.suburn(i32 %a, i32 %b, i32 15) + %1 = call i32 @llvm.riscv.cv.alu.subuRN(i32 %a, i32 %b, i32 15) ret i32 %1 } -define i32 @test.cv.alu.suburn.case.b(i32 %a, i32 %b) { -; CHECK-LABEL: test.cv.alu.suburn.case.b: +define i32 @test.cv.alu.subuRN.case.b(i32 %a, i32 %b) { +; CHECK-LABEL: test.cv.alu.subuRN.case.b: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: cv.suburnr a0, a1, a2 ; CHECK-NEXT: ret - %1 = call i32 @llvm.riscv.cv.alu.suburn(i32 %a, i32 %b, i32 32) + %1 = call i32 @llvm.riscv.cv.alu.subuRN(i32 %a, i32 %b, i32 32) ret i32 %1 } From f86526cc4e12c794c510d5f049103a5b28b3f73f Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Tue, 1 Oct 2024 01:25:20 +0000 Subject: [PATCH 043/151] [gn build] Port 00128a20eec2 --- llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn index cba7867854dff..1bbec962ff68d 100644 --- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn @@ -280,6 +280,7 @@ copy("Headers") { "rdpruintrin.h", "rdseedintrin.h", "riscv_bitmanip.h", + "riscv_corev_alu.h", "riscv_crypto.h", "riscv_ntlh.h", "rtmintrin.h", From f2f9cdd22171f0c54cad7c6b183857f3d856c344 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Mon, 30 Sep 2024 21:45:53 -0400 Subject: [PATCH 044/151] [MLIR] Add test fort #110518 `cast`-to-`dyn_cast` fix (#110563) 
https://github.com/llvm/llvm-project/pull/110518 fixed assertion failures in `cast` introduced in https://github.com/llvm/llvm-project/pull/108450. Signed-off-by: Benoit Jacob --- mlir/test/Dialect/Affine/canonicalize.mlir | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/mlir/test/Dialect/Affine/canonicalize.mlir b/mlir/test/Dialect/Affine/canonicalize.mlir index ff0e987bcef6c..906ae81c76d11 100644 --- a/mlir/test/Dialect/Affine/canonicalize.mlir +++ b/mlir/test/Dialect/Affine/canonicalize.mlir @@ -1514,3 +1514,22 @@ func.func @drop_single_loop_delinearize(%arg0 : index, %arg1 : index) -> index { // CHECK: scf.for %[[IV:[a-zA-Z0-9]+]] = // CHECK-NOT: affine.delinearize_index // CHECK: "some_use"(%{{.+}}, %[[IV]]) + +// ----- + +// CHECK-LABEL: func @delinearize_non_induction_variable +func.func @delinearize_non_induction_variable(%arg0: memref, %i : index, %t0 : index, %t1 : index, %t2 : index) -> index { + %c1024 = arith.constant 1024 : index + %1 = affine.apply affine_map<(d0)[s0, s1, s2] -> (d0 + s0 + s1 * 64 + s2 * 128)>(%i)[%t0, %t1, %t2] + %2 = affine.delinearize_index %1 into (%c1024) : index + return %2 : index +} + +// ----- + +// CHECK-LABEL: func @delinearize_non_loop_like +func.func @delinearize_non_loop_like(%arg0: memref, %i : index) -> index { + %c1024 = arith.constant 1024 : index + %2 = affine.delinearize_index %i into (%c1024) : index + return %2 : index +} From 47d42cfa59b3f418b6f50504d258857abb04ac44 Mon Sep 17 00:00:00 2001 From: Pranav Bhandarkar Date: Mon, 30 Sep 2024 21:58:44 -0500 Subject: [PATCH 045/151] [mlir][OpenMP] - MLIR to LLVMIR translation support for delayed privatization in `omp.target` ops. (#109668) This patch adds support to translate the `private` clause on `omp.target` ops from MLIR to LLVMIR. This first cut only handles non-allocatables. Also, this is for delayed privatization. 
--- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 111 ++++++++++++++---- .../Target/LLVMIR/openmp-target-private.mlir | 99 ++++++++++++++++ 2 files changed, 187 insertions(+), 23 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/openmp-target-private.mlir diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index d788fe1f6165e..c22d9a189a7e0 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -1356,6 +1356,41 @@ class OmpParallelOpConversionManager { unsigned privateArgEndIdx; }; +// Looks up from the operation from and returns the PrivateClauseOp with +// name symbolName +static omp::PrivateClauseOp findPrivatizer(Operation *from, + SymbolRefAttr symbolName) { + omp::PrivateClauseOp privatizer = + SymbolTable::lookupNearestSymbolFrom(from, + symbolName); + assert(privatizer && "privatizer not found in the symbol table"); + return privatizer; +} +// clones the given privatizer. The original privatizer is used as +// the insert point for the clone. +static omp::PrivateClauseOp +clonePrivatizer(LLVM::ModuleTranslation &moduleTranslation, + omp::PrivateClauseOp privatizer, Operation *fromOperation) { + MLIRContext &context = moduleTranslation.getContext(); + mlir::IRRewriter opCloner(&context); + opCloner.setInsertionPoint(privatizer); + auto clone = + llvm::cast(opCloner.clone(*privatizer)); + + // Unique the clone name to avoid clashes in the symbol table. + unsigned counter = 0; + SmallString<256> cloneName = SymbolTable::generateSymbolName<256>( + privatizer.getSymName(), + [&](llvm::StringRef candidate) { + return SymbolTable::lookupNearestSymbolFrom( + fromOperation, StringAttr::get(&context, candidate)) != + nullptr; + }, + counter); + + clone.setSymName(cloneName); + return clone; +} /// Converts the OpenMP parallel operation to LLVM IR. 
static LogicalResult convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, @@ -1611,34 +1646,14 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, continue; SymbolRefAttr privSym = llvm::cast(mlirPrivatizerAttr); - omp::PrivateClauseOp privatizer = - SymbolTable::lookupNearestSymbolFrom( - opInst, privSym); + omp::PrivateClauseOp privatizer = findPrivatizer(opInst, privSym); // Clone the privatizer in case it is used by more than one parallel // region. The privatizer is processed in-place (see below) before it // gets inlined in the parallel region and therefore processing the // original op is dangerous. - - MLIRContext &context = moduleTranslation.getContext(); - mlir::IRRewriter opCloner(&context); - opCloner.setInsertionPoint(privatizer); - auto clone = llvm::cast( - opCloner.clone(*privatizer)); - - // Unique the clone name to avoid clashes in the symbol table. - unsigned counter = 0; - SmallString<256> cloneName = SymbolTable::generateSymbolName<256>( - privatizer.getSymName(), - [&](llvm::StringRef candidate) { - return SymbolTable::lookupNearestSymbolFrom( - opInst, StringAttr::get(&context, candidate)) != - nullptr; - }, - counter); - - clone.setSymName(cloneName); - return {mlirPrivVar, clone}; + return {mlirPrivVar, + clonePrivatizer(moduleTranslation, privatizer, opInst)}; } } @@ -3434,6 +3449,56 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, const auto &arg = targetRegion.front().getArgument(argIndex); moduleTranslation.mapValue(arg, mapOpValue); } + + // Do privatization after moduleTranslation has already recorded + // mapped values. 
+ if (!targetOp.getPrivateVars().empty()) { + builder.restoreIP(allocaIP); + + OperandRange privateVars = targetOp.getPrivateVars(); + std::optional privateSyms = targetOp.getPrivateSyms(); + unsigned numMapVars = targetOp.getMapVars().size(); + Block &firstTargetBlock = targetRegion.front(); + BlockArgument *blockArgsStart = firstTargetBlock.getArguments().begin(); + BlockArgument *privArgsStart = blockArgsStart + numMapVars; + BlockArgument *privArgsEnd = + privArgsStart + targetOp.getPrivateVars().size(); + MutableArrayRef privateBlockArgs(privArgsStart, privArgsEnd); + + for (auto [privVar, privatizerNameAttr, privBlockArg] : + llvm::zip_equal(privateVars, *privateSyms, privateBlockArgs)) { + + SymbolRefAttr privSym = llvm::cast(privatizerNameAttr); + omp::PrivateClauseOp privatizer = findPrivatizer(&opInst, privSym); + if (privatizer.getDataSharingType() == + omp::DataSharingClauseType::FirstPrivate || + !privatizer.getDeallocRegion().empty()) { + opInst.emitError("Translation of omp.target from MLIR to LLVMIR " + "failed because translation of firstprivate and " + " private allocatables is not supported yet"); + bodyGenStatus = failure(); + } else { + Region &allocRegion = privatizer.getAllocRegion(); + BlockArgument allocRegionArg = allocRegion.getArgument(0); + moduleTranslation.mapValue(allocRegionArg, + moduleTranslation.lookupValue(privVar)); + SmallVector yieldedValues; + if (failed(inlineConvertOmpRegions( + allocRegion, "omp.targetop.privatizer", builder, + moduleTranslation, &yieldedValues))) { + opInst.emitError( + "failed to inline `alloc` region of an `omp.private` " + "op in the target region"); + bodyGenStatus = failure(); + } else { + assert(yieldedValues.size() == 1); + moduleTranslation.mapValue(privBlockArg, yieldedValues.front()); + } + moduleTranslation.forgetMapping(allocRegion); + builder.restoreIP(builder.saveIP()); + } + } + } llvm::BasicBlock *exitBlock = convertOmpOpRegions( targetRegion, "omp.target", builder, moduleTranslation, 
bodyGenStatus); builder.SetInsertPoint(exitBlock); diff --git a/mlir/test/Target/LLVMIR/openmp-target-private.mlir b/mlir/test/Target/LLVMIR/openmp-target-private.mlir new file mode 100644 index 0000000000000..6480d4e2bff0b --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-target-private.mlir @@ -0,0 +1,99 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +omp.private {type = private} @simple_var.privatizer : !llvm.ptr alloc { +^bb0(%arg0: !llvm.ptr): + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x i32 {bindc_name = "simple_var", pinned} : (i64) -> !llvm.ptr + omp.yield(%1 : !llvm.ptr) +} +llvm.func @target_map_single_private() attributes {fir.internal_name = "_QPtarget_map_single_private"} { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x i32 {bindc_name = "simple_var"} : (i64) -> !llvm.ptr + %3 = llvm.alloca %0 x i32 {bindc_name = "a"} : (i64) -> !llvm.ptr + %4 = llvm.mlir.constant(2 : i32) : i32 + llvm.store %4, %3 : i32, !llvm.ptr + %5 = omp.map.info var_ptr(%3 : !llvm.ptr, i32) map_clauses(to) capture(ByRef) -> !llvm.ptr {name = "a"} + omp.target map_entries(%5 -> %arg0 : !llvm.ptr) private(@simple_var.privatizer %1 -> %arg1 : !llvm.ptr) { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %6 = llvm.mlir.constant(10 : i32) : i32 + %7 = llvm.load %arg0 : !llvm.ptr -> i32 + %8 = llvm.add %7, %6 : i32 + llvm.store %8, %arg1 : i32, !llvm.ptr + omp.terminator + } + llvm.return +} +// CHECK: define internal void @__omp_offloading_ +// CHECK-NOT: define {{.*}} +// CHECK: %[[PRIV_ALLOC:.*]] = alloca i32, i64 1, align 4 +// CHECK: %[[ADD:.*]] = add i32 {{.*}}, 10 +// CHECK: store i32 %[[ADD]], ptr %[[PRIV_ALLOC]], align 4 + +omp.private {type = private} @n.privatizer : !llvm.ptr alloc { +^bb0(%arg0: !llvm.ptr): + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x f32 {bindc_name = "n", pinned} : (i64) -> !llvm.ptr + omp.yield(%1 : !llvm.ptr) +} +llvm.func @target_map_2_privates() attributes {fir.internal_name = 
"_QPtarget_map_2_privates"} { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x i32 {bindc_name = "simple_var"} : (i64) -> !llvm.ptr + %3 = llvm.alloca %0 x f32 {bindc_name = "n"} : (i64) -> !llvm.ptr + %5 = llvm.alloca %0 x i32 {bindc_name = "a"} : (i64) -> !llvm.ptr + %6 = llvm.mlir.constant(2 : i32) : i32 + llvm.store %6, %5 : i32, !llvm.ptr + %7 = omp.map.info var_ptr(%5 : !llvm.ptr, i32) map_clauses(to) capture(ByRef) -> !llvm.ptr {name = "a"} + omp.target map_entries(%7 -> %arg0 : !llvm.ptr) private(@simple_var.privatizer %1 -> %arg1 : !llvm.ptr, @n.privatizer %3 -> %arg2 : !llvm.ptr) { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: !llvm.ptr): + %8 = llvm.mlir.constant(1.100000e+01 : f32) : f32 + %9 = llvm.mlir.constant(10 : i32) : i32 + %10 = llvm.load %arg0 : !llvm.ptr -> i32 + %11 = llvm.add %10, %9 : i32 + llvm.store %11, %arg1 : i32, !llvm.ptr + %12 = llvm.load %arg1 : !llvm.ptr -> i32 + %13 = llvm.sitofp %12 : i32 to f32 + %14 = llvm.fadd %13, %8 {fastmathFlags = #llvm.fastmath} : f32 + llvm.store %14, %arg2 : f32, !llvm.ptr + omp.terminator + } + llvm.return +} + + +// CHECK: define internal void @__omp_offloading_ +// CHECK: %[[PRIV_I32_ALLOC:.*]] = alloca i32, i64 1, align 4 +// CHECK: %[[PRIV_FLOAT_ALLOC:.*]] = alloca float, i64 1, align 4 +// CHECK: %[[ADD_I32:.*]] = add i32 {{.*}}, 10 +// CHECK: store i32 %[[ADD_I32]], ptr %[[PRIV_I32_ALLOC]], align 4 +// CHECK: %[[LOAD_I32_AGAIN:.*]] = load i32, ptr %[[PRIV_I32_ALLOC]], align 4 +// CHECK: %[[CAST_TO_FLOAT:.*]] = sitofp i32 %[[LOAD_I32_AGAIN]] to float +// CHECK: %[[ADD_FLOAT:.*]] = fadd contract float %[[CAST_TO_FLOAT]], 1.100000e+01 +// CHECK: store float %[[ADD_FLOAT]], ptr %[[PRIV_FLOAT_ALLOC]], align 4 + +// An entirely artifical privatizer that is meant to check multi-block +// privatizers. The idea here is to prove that we set the correct +// insertion points for the builder when generating, first, LLVM IR for the +// privatizer and then for the actual target region. 
+omp.private {type = private} @multi_block.privatizer : !llvm.ptr alloc { +^bb0(%arg0: !llvm.ptr): + %c1 = llvm.mlir.constant(1 : i32) : i32 + llvm.br ^bb1(%c1 : i32) + +^bb1(%arg1: i32): + %0 = llvm.alloca %arg1 x f32 : (i32) -> !llvm.ptr + omp.yield(%0 : !llvm.ptr) +} + +llvm.func @target_op_private_multi_block(%arg0: !llvm.ptr) { + omp.target private(@multi_block.privatizer %arg0 -> %arg2 : !llvm.ptr) { + ^bb0(%arg2: !llvm.ptr): + %0 = llvm.load %arg2 : !llvm.ptr -> f32 + omp.terminator + } + llvm.return +} +// CHECK: define internal void @__omp_offloading_ +// CHECK: %[[ONE:.*]] = phi i32 [ 1, {{.*}} ] +// CHECK: %[[PRIV_ALLOC:.*]] = alloca float, i32 %[[ONE]], align 4 +// CHECK: %[[PHI_ALLOCA:.*]] = phi ptr [ %[[PRIV_ALLOC]], {{.*}} ] +// CHECK: %[[RESULT:.*]] = load float, ptr %[[PHI_ALLOCA]], align 4 From 463a4f15044c04279583d6d0da73ae49f4c242ec Mon Sep 17 00:00:00 2001 From: Younan Zhang Date: Tue, 1 Oct 2024 12:28:30 +0800 Subject: [PATCH 046/151] [Clang][Concepts] Normalize SizeOfPackExpr's pack declaration (#110238) SizeOfPackExpr has a pointer to the referenced pack declaration, which is left as-is during the transformation process. The situation could be subtle when a friend class template declaration comes into play. The declaration per se would be instantiated into its parent declaration context, and consequently, the template parameter list would have a depth adjustment; however, as we don't evaluate constraints during instantiation, those constraints would still reference the original template parameters, which is fine for constraint evaluation because we have handled friend cases in the template argument collection. However, things are different when we want to profile the constraint expression with dependent template arguments. The hash algorithm of SizeOfPackExpr takes its pack declaration as a factor, which is the original template parameter that might still have untransformed template depths after the constraint normalization. 
This patch transforms the pack declaration when normalizing constraint expressions and pluses a fix in HandleFunctionTemplateDecl() where the associated declaration is incorrect for nested specifiers. Note that the fix in HandleFunctionTemplateDecl(), as well as the handling logic for NestedNameSpecifier, would be removed once Krystian's refactoring patch lands. But I still want to incorporate it in the patch for the correction purpose, though it hasn't caused any problems so far - I just tripped over that in getFullyPackExpandedSize() when I tried to extract the transformed declarations from the TemplateArgument. Fixes #93099 --------- Co-authored-by: Matheus Izvekov --- clang/docs/ReleaseNotes.rst | 2 ++ clang/lib/Sema/SemaConcept.cpp | 14 +++++--- clang/lib/Sema/SemaTemplateInstantiate.cpp | 29 +++++++++++++++- .../SemaTemplate/concepts-out-of-line-def.cpp | 34 +++++++++++++++++++ 4 files changed, 73 insertions(+), 6 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index a7c1bb80a49db..6a1e60b9b5097 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -453,6 +453,8 @@ Bug Fixes to C++ Support - Mangle friend function templates with a constraint that depends on a template parameter from an enclosing template as members of the enclosing class. (#GH110247) - Fixed an issue in constraint evaluation, where type constraints on the lambda expression containing outer unexpanded parameters were not correctly expanded. (#GH101754) +- Fixed a bug in constraint expression comparison where the ``sizeof...`` expression was not handled properly + in certain friend declarations. 
(#GH93099) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index 6a1b32598bb4a..67fc603e9ce1d 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -975,11 +975,14 @@ static const Expr *SubstituteConstraintExpressionWithoutSatisfaction( // parameters that the surrounding function hasn't been instantiated yet. Note // this may happen while we're comparing two templates' constraint // equivalence. - LocalInstantiationScope ScopeForParameters(S); - if (auto *FD = DeclInfo.getDecl()->getAsFunction()) + std::optional ScopeForParameters; + if (const NamedDecl *ND = DeclInfo.getDecl(); + ND && ND->isFunctionOrFunctionTemplate()) { + ScopeForParameters.emplace(S); + const FunctionDecl *FD = ND->getAsFunction(); for (auto *PVD : FD->parameters()) { if (!PVD->isParameterPack()) { - ScopeForParameters.InstantiatedLocal(PVD, PVD); + ScopeForParameters->InstantiatedLocal(PVD, PVD); continue; } // This is hacky: we're mapping the parameter pack to a size-of-1 argument @@ -998,9 +1001,10 @@ static const Expr *SubstituteConstraintExpressionWithoutSatisfaction( // that we can eliminate the Scope in the cases where the declarations are // not necessarily instantiated. It would also benefit the noexcept // specifier comparison. 
- ScopeForParameters.MakeInstantiatedLocalArgPack(PVD); - ScopeForParameters.InstantiatedLocalPackArg(PVD, PVD); + ScopeForParameters->MakeInstantiatedLocalArgPack(PVD); + ScopeForParameters->InstantiatedLocalPackArg(PVD, PVD); } + } std::optional ThisScope; diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index e874ab563e2f8..b36381422851f 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -371,7 +371,7 @@ Response HandleFunctionTemplateDecl(const FunctionTemplateDecl *FTD, Specialization->getTemplateInstantiationArgs().asArray(); } Result.addOuterTemplateArguments( - const_cast(FTD), Arguments, + TSTy->getTemplateName().getAsTemplateDecl(), Arguments, /*Final=*/false); } } @@ -1737,6 +1737,33 @@ namespace { return inherited::TransformLambdaBody(E, Body); } + ExprResult RebuildSizeOfPackExpr(SourceLocation OperatorLoc, + NamedDecl *Pack, SourceLocation PackLoc, + SourceLocation RParenLoc, + std::optional Length, + ArrayRef PartialArgs) { + if (SemaRef.CodeSynthesisContexts.back().Kind != + Sema::CodeSynthesisContext::ConstraintNormalization) + return inherited::RebuildSizeOfPackExpr(OperatorLoc, Pack, PackLoc, + RParenLoc, Length, PartialArgs); + +#ifndef NDEBUG + for (auto *Iter = TemplateArgs.begin(); Iter != TemplateArgs.end(); + ++Iter) + for (const TemplateArgument &TA : Iter->Args) + assert(TA.getKind() != TemplateArgument::Pack || TA.pack_size() == 1); +#endif + Sema::ArgumentPackSubstitutionIndexRAII SubstIndex( + SemaRef, /*NewSubstitutionIndex=*/0); + Decl *NewPack = TransformDecl(PackLoc, Pack); + if (!NewPack) + return ExprError(); + + return inherited::RebuildSizeOfPackExpr(OperatorLoc, + cast(NewPack), PackLoc, + RParenLoc, Length, PartialArgs); + } + ExprResult TransformRequiresExpr(RequiresExpr *E) { LocalInstantiationScope Scope(SemaRef, /*CombineWithOuterScope=*/true); ExprResult TransReq = inherited::TransformRequiresExpr(E); diff --git 
a/clang/test/SemaTemplate/concepts-out-of-line-def.cpp b/clang/test/SemaTemplate/concepts-out-of-line-def.cpp index 5450d105a6f54..8ca399a0f729a 100644 --- a/clang/test/SemaTemplate/concepts-out-of-line-def.cpp +++ b/clang/test/SemaTemplate/concepts-out-of-line-def.cpp @@ -666,3 +666,37 @@ int foo() { } } // namespace eve + +namespace GH93099 { + +// Issues with sizeof...(expr) + +template struct C { + template + requires(sizeof...(N) > 0) + friend class NTTP; + + template + requires(sizeof...(Tp) > 0) + friend class TP; + + template