From 6a214ec1eeef6b404bf111edeca13c6e0d958103 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Tue, 14 Jan 2025 13:46:34 -0800 Subject: [PATCH 01/82] [MemProf] Fix an assertion when writing distributed index for aliasee (#122946) The ThinLTO index bitcode writer uses a helper forEachSummary to manage preparation and writing of summaries needed for each distributed index file. For alias summaries, it invokes the provided callback for the aliasee as well, as we at least need to produce a value id for the alias's summary. However, all summary generation for the aliasee itself should be skipped on calls when IsAliasee is true. We invoke the callback again if that value's summary is to be written as well. We were asserting in debug mode when invoking collectMemProfCallStacks, because a given stack id index was not in the StackIdIndicesToIndex map. It was not added because the forEachSummary invocation that records these ids in the map (invoked from the IndexBitcodeWriter constructor) was correctly skipping this handling when invoked for aliasees. We need the same guard in the invocation that calls collectMemProfCallStacks. Note that this doesn't cause any real problems in a non-asserts build as the missing map lookup will return the default 0 value from the map, which isn't used since we don't actually write the corresponding summary. --- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 8 ++++++++ llvm/test/ThinLTO/X86/memprof_direct_recursion.ll | 9 +++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 94d3afa6c1e33..31c96400dd0fe 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -494,6 +494,9 @@ class IndexBitcodeWriter : public BitcodeWriterBase { // are currently saved in the index in terms of GUID. forEachSummary([&](GVInfo I, bool IsAliasee) { GUIDToValueIdMap[I.first] = ++GlobalValueId; + // If this is invoked for an aliasee, we want to record the above mapping, + // but not the information needed for its summary entry (if the aliasee is + // to be imported, we will invoke this separately with IsAliasee=false). if (IsAliasee) return; auto *FS = dyn_cast(I.second); @@ -4847,6 +4850,11 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { // radix tree array are identified based on this order. MapVector> CallStacks; forEachSummary([&](GVInfo I, bool IsAliasee) { + // Don't collect this when invoked for an aliasee, as it is not needed for + // the alias summary. If the aliasee is to be imported, we will invoke this + // separately with IsAliasee=false. 
+ if (IsAliasee) + return; GlobalValueSummary *S = I.second; assert(S); auto *FS = dyn_cast(S); diff --git a/llvm/test/ThinLTO/X86/memprof_direct_recursion.ll b/llvm/test/ThinLTO/X86/memprof_direct_recursion.ll index 102ee64d4638d..63139cacd8fba 100644 --- a/llvm/test/ThinLTO/X86/memprof_direct_recursion.ll +++ b/llvm/test/ThinLTO/X86/memprof_direct_recursion.ll @@ -34,6 +34,7 @@ ; RUN: -supports-hot-cold-new \ ; RUN: -thinlto-distributed-indexes \ ; RUN: -r=%t/b.o,_Z3fooi,plx \ +; RUN: -r=%t/b.o,aliasee,plx \ ; RUN: -r=%t/b.o,a \ ; RUN: -r=%t/b.o,b \ ; RUN: -r=%t/b.o,_Znam \ @@ -65,11 +66,15 @@ source_filename = "b.cpp" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" +;; Make sure the distributed summary bitcode writing succeeds when the memprof +;; metadata is in an aliasee. +@_Z3fooi = alias void (), ptr @aliasee + @a = external local_unnamed_addr global ptr, align 8 @b = external local_unnamed_addr global i32, align 4 ; Function Attrs: mustprogress uwtable -define dso_local void @_Z3fooi(i32 noundef %0) local_unnamed_addr #0 !dbg !9 { +define dso_local void @aliasee(i32 noundef %0) local_unnamed_addr #0 !dbg !9 { br label %2, !dbg !12 2: ; preds = %7, %1 @@ -222,4 +227,4 @@ attributes #1 = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "t !19 = !DILocation(line: 7, column: 5, scope: !9) !20 = !{i64 8256520048276991898} !21 = !DILocation(line: 8, column: 5, scope: !9) -!22 = !DISubprogram(name: "foo", linkageName: "_Z3fooi", scope: !1, file: !1, line: 1, type: !10, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized) \ No newline at end of file +!22 = !DISubprogram(name: "foo", linkageName: "_Z3fooi", scope: !1, file: !1, line: 1, type: !10, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized) From 1682deed0fd02c6aca98154e8e9cf6c573ff6d45 Mon Sep 17 00:00:00 2001 From: Eli Friedman Date: Tue, 14 Jan 2025 13:57:44 -0800 Subject: [PATCH 02/82] [libclang] Add API to query more information about base classes. (#120300) The first API is clang_visitCXXBaseClasses: this allows visiting the base classes without going through the generic child visitor (which is awkward, and doesn't work for template instantiations). The second API is clang_getOffsetOfBase; this allows computing the offset of a base in the class layout, the same way clang_Cursor_getOffsetOfField computes the offset of a field. Also, add a Python binding for the existing function clang_isVirtualBase. 
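As a quick illustration of how the two new entry points compose from C, here is a
minimal sketch. It is illustrative only: the helper names (visitBase, dumpBases)
and the assumption that RecordType is a CXType for a fully-defined C++ class
obtained elsewhere are mine, not part of this patch.

    #include <clang-c/Index.h>
    #include <stdio.h>

    /* Invoked once per direct base class. Data carries the parent class's
       declaration cursor so the visitor can ask for the base's offset
       within the parent's layout. */
    static enum CXVisitorResult visitBase(CXCursor Base, CXClientData Data) {
      CXCursor *Parent = (CXCursor *)Data;
      long long OffsetBits = clang_getOffsetOfBase(*Parent, Base);
      printf("base: offset %lld bits, virtual=%u\n", OffsetBits,
             clang_isVirtualBase(Base));
      return CXVisit_Continue; /* CXVisit_Break would stop the traversal */
    }

    /* RecordType is assumed to be the CXType of a fully-defined C++ class,
       e.g. obtained via clang_getCursorType on its declaration cursor. */
    static void dumpBases(CXType RecordType) {
      CXCursor Decl = clang_getTypeDeclaration(RecordType);
      clang_visitCXXBaseClasses(RecordType, visitBase, &Decl);
    }

The Python side mirrors this one-to-one: Type.get_bases() drives
clang_visitCXXBaseClasses, while Cursor.is_virtual_base() and
Cursor.get_base_offsetof(parent) wrap the per-base queries, as exercised by the
new test_base_classes test below.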
--- clang/bindings/python/clang/cindex.py | 25 +++++++++++++ .../bindings/python/tests/cindex/test_type.py | 25 +++++++++++++ clang/docs/ReleaseNotes.rst | 10 ++++++ clang/include/clang-c/Index.h | 36 +++++++++++++++++-- clang/tools/libclang/CIndexCXX.cpp | 27 ++++++++++++++ clang/tools/libclang/CXType.cpp | 34 ++++++++++++++++++ clang/tools/libclang/libclang.map | 2 ++ 7 files changed, 157 insertions(+), 2 deletions(-) diff --git a/clang/bindings/python/clang/cindex.py b/clang/bindings/python/clang/cindex.py index 710259de855f9..806e1b40f3c9e 100644 --- a/clang/bindings/python/clang/cindex.py +++ b/clang/bindings/python/clang/cindex.py @@ -2133,6 +2133,14 @@ def get_field_offsetof(self): """Returns the offsetof the FIELD_DECL pointed by this Cursor.""" return conf.lib.clang_Cursor_getOffsetOfField(self) # type: ignore [no-any-return] + def get_base_offsetof(self, parent): + """Returns the offsetof the CXX_BASE_SPECIFIER pointed by this Cursor.""" + return conf.lib.clang_getOffsetOfBase(parent, self) # type: ignore [no-any-return] + + def is_virtual_base(self): + """Returns whether the CXX_BASE_SPECIFIER pointed by this Cursor is virtual.""" + return conf.lib.clang_isVirtualBase(self) # type: ignore [no-any-return] + def is_anonymous(self): """ Check whether this is a record type without a name, or a field where @@ -2687,6 +2695,21 @@ def visitor(field, children): conf.lib.clang_Type_visitFields(self, fields_visit_callback(visitor), fields) return iter(fields) + def get_bases(self): + """Return an iterator for accessing the base classes of this type.""" + + def visitor(base, children): + assert base != conf.lib.clang_getNullCursor() + + # Create reference to TU so it isn't GC'd before Cursor. + base._tu = self._tu + bases.append(base) + return 1 # continue + + bases: list[Cursor] = [] + conf.lib.clang_visitCXXBaseClasses(self, fields_visit_callback(visitor), bases) + return iter(bases) + def get_exception_specification_kind(self): """ Return the kind of the exception specification; a value from @@ -3940,6 +3963,7 @@ def set_property(self, property, value): ("clang_getNumDiagnosticsInSet", [c_object_p], c_uint), ("clang_getNumElements", [Type], c_longlong), ("clang_getNumOverloadedDecls", [Cursor], c_uint), + ("clang_getOffsetOfBase", [Cursor, Cursor], c_longlong), ("clang_getOverloadedDecl", [Cursor, c_uint], Cursor), ("clang_getPointeeType", [Type], Type), ("clang_getRange", [SourceLocation, SourceLocation], SourceRange), @@ -3992,6 +4016,7 @@ def set_property(self, property, value): [TranslationUnit, SourceRange, POINTER(POINTER(Token)), POINTER(c_uint)], ), ("clang_visitChildren", [Cursor, cursor_visit_callback, py_object], c_uint), + ("clang_visitCXXBaseClasses", [Type, fields_visit_callback, py_object], c_uint), ("clang_Cursor_getNumArguments", [Cursor], c_int), ("clang_Cursor_getArgument", [Cursor, c_uint], Cursor), ("clang_Cursor_getNumTemplateArguments", [Cursor], c_int), diff --git a/clang/bindings/python/tests/cindex/test_type.py b/clang/bindings/python/tests/cindex/test_type.py index f39da8b5faf29..db7dc6458581e 100644 --- a/clang/bindings/python/tests/cindex/test_type.py +++ b/clang/bindings/python/tests/cindex/test_type.py @@ -534,3 +534,28 @@ def test_pretty(self): self.assertEqual(f.type.get_canonical().pretty_printed(pp), "X") pp.set_property(PrintingPolicyProperty.SuppressTagKeyword, False) self.assertEqual(f.type.get_canonical().pretty_printed(pp), "struct X") + + def test_base_classes(self): + source = """ + class A { int a; }; + class B { int b; }; + class C { int c; }; 
+    template <typename T>
+    class Template : public A, public B, virtual C {
+    };
+    Template<int> instance;
+    int bar;
+    """
+    tu = get_tu(source, lang="cpp")
+    cursor = get_cursor(tu, "instance")
+    cursor_type = cursor.type
+    cursor_type_decl = cursor_type.get_declaration()
+    self.assertEqual(cursor.kind, CursorKind.VAR_DECL)
+    bases = list(cursor_type.get_bases())
+    self.assertEqual(len(bases), 3)
+    self.assertFalse(bases[0].is_virtual_base())
+    self.assertEqual(bases[0].get_base_offsetof(cursor_type_decl), 64)
+    self.assertFalse(bases[1].is_virtual_base())
+    self.assertEqual(bases[1].get_base_offsetof(cursor_type_decl), 96)
+    self.assertTrue(bases[2].is_virtual_base())
+    self.assertEqual(bases[2].get_base_offsetof(cursor_type_decl), 128)
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 61aa955ca9b9d..6ac91f43e66d8 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1224,6 +1224,10 @@ libclang
   whether the first one comes strictly before the second in the source code.
 - Add ``clang_getTypePrettyPrinted``. It allows controlling the PrintingPolicy used
   to pretty-print a type.
+- Added ``clang_visitCXXBaseClasses``, which allows visiting the base classes
+  of a class.
+- Added ``clang_getOffsetOfBase``, which allows computing the offset of a base
+  class in a class's layout.
 
 Static Analyzer
 ---------------
@@ -1371,6 +1375,12 @@ Python Binding Changes
   declaration is an anonymous union or anonymous struct.
 - Added ``Type.pretty_printed``, a binding for ``clang_getTypePrettyPrinted``,
   which allows changing the formatting of pretty-printed types.
+- Added ``Cursor.is_virtual_base``, a binding for ``clang_isVirtualBase``,
+  which checks whether a base class is virtual.
+- Added ``Type.get_bases``, a binding for ``clang_visitCXXBaseClasses``, which
+  allows visiting the base classes of a class.
+- Added ``Cursor.get_base_offsetof``, a binding for ``clang_getOffsetOfBase``,
+  which allows computing the offset of a base class in a class's layout.
 
 OpenMP Support
 --------------
diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h
index ad64497ceb802..aac5d1fa8aa2e 100644
--- a/clang/include/clang-c/Index.h
+++ b/clang/include/clang-c/Index.h
@@ -3605,8 +3605,8 @@ CINDEX_LINKAGE enum CXTypeNullabilityKind clang_Type_getNullability(CXType T);
 
 /**
  * List the possible error codes for \c clang_Type_getSizeOf,
- *   \c clang_Type_getAlignOf, \c clang_Type_getOffsetOf and
- *   \c clang_Cursor_getOffsetOf.
+ *   \c clang_Type_getAlignOf, \c clang_Type_getOffsetOf,
+ *   \c clang_Cursor_getOffsetOf, and \c clang_getOffsetOfBase.
  *
  * A value of this enumeration type can be returned if the target type is not
  * a valid argument to sizeof, alignof or offsetof.
@@ -3771,6 +3771,15 @@ CINDEX_LINKAGE enum CXRefQualifierKind clang_Type_getCXXRefQualifier(CXType T);
  */
 CINDEX_LINKAGE unsigned clang_isVirtualBase(CXCursor);
 
+/**
+ * Returns the offset in bits of a CX_CXXBaseSpecifier relative to the parent
+ * class.
+ *
+ * Returns a small negative number if the offset cannot be computed. See
+ * CXTypeLayoutError for error codes.
+ */
+CINDEX_LINKAGE long long clang_getOffsetOfBase(CXCursor Parent, CXCursor Base);
+
 /**
  * Represents the C++ access control level to a base class for a
  * cursor with kind CX_CXXBaseSpecifier.
@@ -6648,6 +6657,29 @@ typedef enum CXVisitorResult (*CXFieldVisitor)(CXCursor C,
 CINDEX_LINKAGE unsigned clang_Type_visitFields(CXType T, CXFieldVisitor visitor,
                                                CXClientData client_data);
 
+/**
+ * Visit the base classes of a type.
+ *
+ * This function visits all the direct base classes of the given type,
+ * invoking the given \p visitor function with the cursors of each
+ * visited base. The traversal may be ended prematurely if
+ * the visitor returns \c CXVisit_Break.
+ *
+ * \param T the record type whose base classes may be visited.
+ *
+ * \param visitor the visitor function that will be invoked for each
+ * base class of \p T.
+ *
+ * \param client_data pointer data supplied by the client, which will
+ * be passed to the visitor each time it is invoked.
+ *
+ * \returns a non-zero value if the traversal was terminated
+ * prematurely by the visitor returning \c CXVisit_Break.
+ */
+CINDEX_LINKAGE unsigned clang_visitCXXBaseClasses(CXType T,
+                                                  CXFieldVisitor visitor,
+                                                  CXClientData client_data);
+
 /**
  * Describes the kind of binary operators.
  */
diff --git a/clang/tools/libclang/CIndexCXX.cpp b/clang/tools/libclang/CIndexCXX.cpp
index a1be70dde9f67..8b84fdc22ecff 100644
--- a/clang/tools/libclang/CIndexCXX.cpp
+++ b/clang/tools/libclang/CIndexCXX.cpp
@@ -27,6 +27,33 @@ unsigned clang_isVirtualBase(CXCursor C) {
   return B->isVirtual();
 }
 
+unsigned clang_visitCXXBaseClasses(CXType PT, CXFieldVisitor visitor,
+                                   CXClientData client_data) {
+  CXCursor PC = clang_getTypeDeclaration(PT);
+  if (clang_isInvalid(PC.kind))
+    return false;
+  const CXXRecordDecl *RD =
+      dyn_cast_if_present<CXXRecordDecl>(cxcursor::getCursorDecl(PC));
+  if (!RD || RD->isInvalidDecl())
+    return false;
+  RD = RD->getDefinition();
+  if (!RD || RD->isInvalidDecl())
+    return false;
+
+  for (auto &Base : RD->bases()) {
+    // Callback to the client.
+    switch (
+        visitor(cxcursor::MakeCursorCXXBaseSpecifier(&Base, getCursorTU(PC)),
+                client_data)) {
+    case CXVisit_Break:
+      return true;
+    case CXVisit_Continue:
+      break;
+    }
+  }
+  return true;
+}
+
 enum CX_CXXAccessSpecifier clang_getCXXAccessSpecifier(CXCursor C) {
   AccessSpecifier spec = AS_none;
 
diff --git a/clang/tools/libclang/CXType.cpp b/clang/tools/libclang/CXType.cpp
index f1b661435c499..5da87c6f4aa9c 100644
--- a/clang/tools/libclang/CXType.cpp
+++ b/clang/tools/libclang/CXType.cpp
@@ -19,6 +19,7 @@
 #include "clang/AST/DeclObjC.h"
 #include "clang/AST/DeclTemplate.h"
 #include "clang/AST/Expr.h"
+#include "clang/AST/RecordLayout.h"
 #include "clang/AST/Type.h"
 #include "clang/Basic/AddressSpaces.h"
 #include "clang/Frontend/ASTUnit.h"
@@ -1108,6 +1109,39 @@ long long clang_Cursor_getOffsetOfField(CXCursor C) {
   return -1;
 }
 
+long long clang_getOffsetOfBase(CXCursor Parent, CXCursor Base) {
+  if (Base.kind != CXCursor_CXXBaseSpecifier)
+    return -1;
+
+  if (!clang_isDeclaration(Parent.kind))
+    return -1;
+
+  // We need to validate the parent type.
+  CXType PT = clang_getCursorType(Parent);
+  long long Error = validateFieldParentType(Parent, PT);
+  if (Error < 0)
+    return Error;
+
+  const CXXRecordDecl *ParentRD =
+      dyn_cast<CXXRecordDecl>(cxcursor::getCursorDecl(Parent));
+  if (!ParentRD)
+    return -1;
+
+  ASTContext &Ctx = cxcursor::getCursorContext(Base);
+  const CXXBaseSpecifier *B = cxcursor::getCursorCXXBaseSpecifier(Base);
+  if (ParentRD->bases_begin() > B || ParentRD->bases_end() <= B)
+    return -1;
+
+  const CXXRecordDecl *BaseRD = B->getType()->getAsCXXRecordDecl();
+  if (!BaseRD)
+    return -1;
+
+  const ASTRecordLayout &Layout = Ctx.getASTRecordLayout(ParentRD);
+  if (B->isVirtual())
+    return Ctx.toBits(Layout.getVBaseClassOffset(BaseRD));
+  return Ctx.toBits(Layout.getBaseClassOffset(BaseRD));
+}
+
 enum CXRefQualifierKind clang_Type_getCXXRefQualifier(CXType T) {
   QualType QT = GetQualType(T);
   if (QT.isNull())
diff --git a/clang/tools/libclang/libclang.map b/clang/tools/libclang/libclang.map
index 00ba56ab3c79d..8ca8a58b76d9e 100644
--- a/clang/tools/libclang/libclang.map
+++ b/clang/tools/libclang/libclang.map
@@ -436,8 +436,10 @@ LLVM_19 {
 
 LLVM_20 {
   global:
+    clang_getOffsetOfBase;
     clang_getTypePrettyPrinted;
     clang_isBeforeInTranslationUnit;
+    clang_visitCXXBaseClasses;
 };
 
 # Example of how to add a new symbol version entry. If you do add a new symbol

From 25f28ddd69ed2453726c0934ba6feea8ae6f10f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?=
Date: Tue, 14 Jan 2025 14:05:57 -0800
Subject: [PATCH 03/82] [flang][cuda][NFC] Fix file header

---
 flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp
index 2e6c272fa9089..5ce39f99bbb12 100644
--- a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp
@@ -1,4 +1,4 @@
-//===-- CUFOpConversion.cpp -----------------------------------------------===//
+//===-- CUFDeviceGlobal.cpp -----------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.

From 1de3dc7d23dd6b856efad3a3a04f2396328726d7 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Tue, 14 Jan 2025 22:07:38 +0000
Subject: [PATCH 04/82] [LV] Bail out early if BTC+1 wraps.

Currently we fail to detect the case where BTC + 1 wraps, i.e. the
vector trip count is 0. In those cases, the minimum iteration count
check will fail, and the vector code will never be executed. Explicitly
check for this condition in computeMaxVF and avoid trying to vectorize
altogether.

Note that a number of tests needed to be updated, because the vector
loop would never be executed given the input IR.

Fixes https://github.com/llvm/llvm-project/issues/122558.
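For intuition about the failure mode: the backedge-taken count comes out as -1
when, for example, the induction variable walks through every value of its type
before the exit compare succeeds. A hedged source-level sketch follows; it is
illustrative only (the name wrap_trip_count is made up here, and it assumes a
target where unsigned is 32 bits, matching the i32 induction variables in the
updated tests), while the regression tests in this patch exercise the
equivalent IR directly.

    // The backedge is taken UINT_MAX times, i.e. the backedge-taken count
    // is -1 in i32, so BTC + 1 (the trip count) wraps to 0. The minimum
    // iteration count check would then always branch to the scalar loop,
    // making any vector body dead code; with this patch the vectorizer
    // reports "Trip count computation wrapped" and bails out instead.
    void wrap_trip_count(unsigned char *dst) {
      unsigned i = 0;
      do {
        dst[i] = 0;        // illustrative body
      } while (++i != 0);  // exits only once i wraps back around to 0
    }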
--- .../Transforms/Vectorize/LoopVectorize.cpp | 19 +- .../AArch64/loopvectorize_pr33804_double.ll | 8 +- .../partial-reduce-dot-product-epilogue.ll | 47 +- .../partial-reduce-dot-product-mixed.ll | 127 +- .../partial-reduce-dot-product-neon.ll | 1629 ++++++++++++++++- .../AArch64/partial-reduce-dot-product.ll | 1162 +++++++++++- .../AArch64/partial-reduce-no-dotprod.ll | 6 +- .../LoopVectorize/AArch64/store-costs-sve.ll | 107 +- .../LoopVectorize/AArch64/strict-fadd.ll | 2 +- .../AArch64/sve-widen-extractvalue.ll | 2 +- .../LoopVectorize/AArch64/vplan-printing.ll | 8 +- .../LoopVectorize/Hexagon/maximum-vf-crash.ll | 8 +- .../X86/cost-constant-known-via-scev.ll | 10 +- .../LoopVectorize/X86/cost-model.ll | 42 +- .../X86/drop-poison-generating-flags.ll | 12 +- .../LoopVectorize/X86/interleave-cost.ll | 85 +- .../X86/replicate-uniform-call.ll | 10 +- ...og-vectorization-vector-trip-count-zero.ll | 35 + .../first-order-recurrence-chains.ll | 6 +- .../LoopVectorize/if-pred-stores.ll | 78 +- .../Transforms/LoopVectorize/induction.ll | 93 +- .../interleave-and-scalarize-only.ll | 4 +- .../Transforms/LoopVectorize/is_fpclass.ll | 12 +- .../optimal-epilog-vectorization.ll | 119 ++ .../LoopVectorize/runtime-checks-hoist.ll | 24 +- .../LoopVectorize/scev-predicate-reasoning.ll | 81 +- .../vplan-sink-scalars-and-merge-vf1.ll | 12 +- 27 files changed, 3247 insertions(+), 501 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/epilog-vectorization-vector-trip-count-zero.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 744faef192438..fe2fb5e9faaea 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4052,7 +4052,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { return FixedScalableVFPair::getNone(); } - unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); + ScalarEvolution *SE = PSE.getSE(); + unsigned TC = SE->getSmallConstantTripCount(TheLoop); unsigned MaxTC = PSE.getSmallConstantMaxTripCount(); LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); if (TC != MaxTC) @@ -4064,6 +4065,22 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { return FixedScalableVFPair::getNone(); } + // If BTC matches the widest induction type and is -1 then the trip count + // computation will wrap to 0 and the vector trip count will be 0. Do not try + // to vectorize. 
+  const SCEV *BTC = SE->getBackedgeTakenCount(TheLoop);
+  if (!isa<SCEVCouldNotCompute>(BTC) &&
+      BTC->getType()->getScalarSizeInBits() >=
+          Legal->getWidestInductionType()->getScalarSizeInBits() &&
+      SE->isKnownPredicate(CmpInst::ICMP_EQ, BTC,
+                           SE->getMinusOne(BTC->getType()))) {
+    reportVectorizationFailure(
+        "Trip count computation wrapped",
+        "backedge-taken count is -1, loop trip count wrapped to 0",
+        "TripCountWrapped", ORE, TheLoop);
+    return FixedScalableVFPair::getNone();
+  }
+
   switch (ScalarEpilogueStatus) {
   case CM_ScalarEpilogueAllowed:
     return computeFeasibleMaxVF(MaxTC, UserVF, false);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/loopvectorize_pr33804_double.ll b/llvm/test/Transforms/LoopVectorize/AArch64/loopvectorize_pr33804_double.ll
index 0f3db228e9cfe..8da4f0c456b6b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/loopvectorize_pr33804_double.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/loopvectorize_pr33804_double.ll
@@ -26,7 +26,7 @@ for.body14.i.i: ; preds = %for.body14.i.i, %en
   %next19.i.i = getelementptr inbounds %struct.CvNode1D, ptr %dst, i32 %i.1424.i.i, i32 1
   store ptr %dst, ptr %next19.i.i, align 4
   %inc21.i.i = add nuw nsw i32 %i.1424.i.i, 1
-  %exitcond438.i.i = icmp eq i32 %inc21.i.i, 0
+  %exitcond438.i.i = icmp eq i32 %inc21.i.i, 1000
   br i1 %exitcond438.i.i, label %for.end22.i.i, label %for.body14.i.i
 
 for.end22.i.i: ; preds = %for.body14.i.i
@@ -52,7 +52,7 @@ for.body14.i.i: ; preds = %for.body14.i.i, %en
   %val.i.i = getelementptr inbounds %struct.CvNode1D2, ptr %arrayidx15.i.i1427, i32 0, i32 1
   store double 0xC415AF1D80000000, ptr %val.i.i, align 4
   %inc21.i.i = add nuw nsw i32 %i.1424.i.i, 1
-  %exitcond438.i.i = icmp eq i32 %inc21.i.i, 0
+  %exitcond438.i.i = icmp eq i32 %inc21.i.i, 1000
   br i1 %exitcond438.i.i, label %for.end22.i.i, label %for.body14.i.i
 
 for.end22.i.i: ; preds = %for.body14.i.i
@@ -79,7 +79,7 @@ for.body14.i.i: ; preds = %for.body14.i.i, %en
   store double %load_d, ptr %dst.ptr, align 4
   store ptr %load_p, ptr %dst.ptr.1, align 4
   %inc21.i.i = add nuw nsw i32 %i.1424.i.i, 1
-  %exitcond438.i.i = icmp eq i32 %inc21.i.i, 0
+  %exitcond438.i.i = icmp eq i32 %inc21.i.i, 1000
   br i1 %exitcond438.i.i, label %for.end22.i.i, label %for.body14.i.i
 
 for.end22.i.i: ; preds = %for.body14.i.i
@@ -107,7 +107,7 @@ for.body14.i.i: ; preds = %for.body14.i.i, %en
   store double %load_d, ptr %dst.ptr, align 4
   store ptr %load_p, ptr %dst.ptr.1, align 4
   %inc21.i.i = add nuw nsw i32 %i.1424.i.i, 1
-  %exitcond438.i.i = icmp eq i32 %inc21.i.i, 0
+  %exitcond438.i.i = icmp eq i32 %inc21.i.i, 1000
   br i1 %exitcond438.i.i, label %for.end22.i.i, label %for.body14.i.i
 
 for.end22.i.i: ; preds = %for.body14.i.i
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
index 5cc00daab7ce5..37c489cd0d4cf 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
@@ -10,10 +10,10 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
 ; CHECK-NEXT:  iter.check:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK:       vector.main.loop.iter.check:
-;
CHECK-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: @@ -31,7 +31,7 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP8]], [[TMP5]] ; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP9]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) @@ -42,12 +42,12 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP13]] ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: -; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP15]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP15]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 ; CHECK-NEXT: [[TMP18:%.*]] = insertelement zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0 @@ -71,8 +71,29 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP27]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 1024, [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX7:%.*]] = phi i32 [ [[TMP29]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX7]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[LOAD_B:%.*]] = load i8, 
ptr [[GEP_B]], align 1 +; CHECK-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: for.exit: +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ [[TMP29]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[ADD_LCSSA]] ; entry: br label %for.body @@ -89,7 +110,7 @@ for.body: ; preds = %for.body, %entry %mul = mul i32 %ext.b, %ext.a %add = add i32 %mul, %accum %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 + %exitcond.not = icmp eq i64 %iv.next, 1024 br i1 %exitcond.not, label %for.exit, label %for.body for.exit: ; preds = %for.body @@ -211,3 +232,13 @@ while.end.loopexit: ; preds = %while.body attributes #0 = { vscale_range(1,16) "target-features"="+sve" } attributes #1 = { "target-cpu"="apple-m1" } +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll index 74db8683d5df8..5c7ea8efa7ed7 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll @@ -9,7 +9,7 @@ define i32 @dotp_z_s(ptr %a, ptr %b) #0 { ; CHECK-LABEL: define i32 @dotp_z_s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: @@ -36,17 +36,38 @@ define i32 @dotp_z_s(ptr %a, ptr %b) #0 { ; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) ; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] ; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: 
[[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: for.exit: +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-NOI8MM-LABEL: define i32 @dotp_z_s( ; CHECK-NOI8MM-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NOI8MM-NEXT: entry: -; CHECK-NOI8MM-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NOI8MM-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-NOI8MM: vector.ph: ; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-NOI8MM: vector.body: @@ -73,12 +94,33 @@ define i32 @dotp_z_s(ptr %a, ptr %b) #0 { ; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) ; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) ; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NOI8MM-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-NOI8MM: middle.block: ; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] ; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; CHECK-NOI8MM-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NOI8MM: scalar.ph: +; CHECK-NOI8MM-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NOI8MM-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NOI8MM-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOI8MM: for.body: +; CHECK-NOI8MM-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-NOI8MM-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-NOI8MM-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-NOI8MM-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-NOI8MM-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 
+; CHECK-NOI8MM-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32 +; CHECK-NOI8MM-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-NOI8MM-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-NOI8MM-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NOI8MM-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-NOI8MM-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NOI8MM: for.exit: +; CHECK-NOI8MM-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-NOI8MM-NEXT: ret i32 [[ADD_LCSSA]] ; entry: br label %for.body @@ -95,7 +137,7 @@ for.body: ; preds = %for.body, %entry %mul = mul i32 %ext.b, %ext.a %add = add i32 %mul, %accum %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 + %exitcond.not = icmp eq i64 %iv.next, 1024 br i1 %exitcond.not, label %for.exit, label %for.body for.exit: ; preds = %for.body @@ -104,9 +146,9 @@ for.exit: ; preds = %for.body define i32 @dotp_s_z(ptr %a, ptr %b) #0 { ; CHECK-LABEL: define i32 @dotp_s_z( -; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: @@ -133,17 +175,38 @@ define i32 @dotp_s_z(ptr %a, ptr %b) #0 { ; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) ; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] ; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32 +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; 
CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: for.exit: +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-NOI8MM-LABEL: define i32 @dotp_s_z( -; CHECK-NOI8MM-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NOI8MM-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NOI8MM-NEXT: entry: -; CHECK-NOI8MM-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NOI8MM-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-NOI8MM: vector.ph: ; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-NOI8MM: vector.body: @@ -170,12 +233,33 @@ define i32 @dotp_s_z(ptr %a, ptr %b) #0 { ; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) ; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) ; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-NOI8MM-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NOI8MM-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-NOI8MM: middle.block: ; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] ; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; CHECK-NOI8MM-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NOI8MM: scalar.ph: +; CHECK-NOI8MM-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NOI8MM-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NOI8MM-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOI8MM: for.body: +; CHECK-NOI8MM-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-NOI8MM-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-NOI8MM-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32 +; CHECK-NOI8MM-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-NOI8MM-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-NOI8MM-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-NOI8MM-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-NOI8MM-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-NOI8MM-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NOI8MM-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-NOI8MM-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NOI8MM: for.exit: +; CHECK-NOI8MM-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-NOI8MM-NEXT: ret i32 [[ADD_LCSSA]] ; entry: br label %for.body @@ -192,7 +276,7 @@ for.body: ; preds = %for.body, %entry %mul = mul i32 
%ext.b, %ext.a %add = add i32 %mul, %accum %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 + %exitcond.not = icmp eq i64 %iv.next, 1024 br i1 %exitcond.not, label %for.exit, label %for.body for.exit: ; preds = %for.body @@ -204,3 +288,18 @@ for.exit: ; preds = %for.body !9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} !10 = !{!"llvm.loop.vectorize.enable", i1 true} attributes #0 = { vscale_range(1,16) "target-features"="+sve" } +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +;. +; CHECK-NOI8MM: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-NOI8MM: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-NOI8MM: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-NOI8MM: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-NOI8MM: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-NOI8MM: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll index c66695f1b50f0..97a5801d88108 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll @@ -10,7 +10,7 @@ define i32 @dotp(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-LABEL: define i32 @dotp( ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: @@ -28,16 +28,37 @@ define i32 @dotp(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = 
phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @dotp( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: @@ -64,17 +85,38 @@ define i32 @dotp(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, 
ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-MAXBW-LABEL: define i32 @dotp( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: @@ -92,11 +134,32 @@ define i32 @dotp(ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-MAXBW-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]] ; entry: br label %for.body @@ -113,7 +176,7 @@ for.body: ; preds = %for.body, %entry %mul = mul i32 %ext.b, %ext.a %add = add i32 %mul, %accum %iv.next = add i64 %iv, 1 - 
%exitcond.not = icmp eq i64 %iv.next, 0 + %exitcond.not = icmp eq i64 %iv.next, 1024 br i1 %exitcond.not, label %for.exit, label %for.body for.exit: ; preds = %for.body @@ -124,7 +187,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_different_types( ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: @@ -202,13 +265,37 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[TMP68:%.*]] = mul <16 x i32> [[TMP67]], [[TMP18]] ; CHECK-INTERLEAVE1-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]]) +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP71]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i16, ptr [[GEP_B]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i16 [[LOAD_B]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP71]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_different_types( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: ; CHECK-INTERLEAVED-NEXT: br label 
[[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: @@ -357,13 +444,38 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP137]] = add <16 x i32> [[TMP135]], [[VEC_PHI]] ; CHECK-INTERLEAVED-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-INTERLEAVED-NEXT: [[TMP139:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP139:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP139]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP138]], [[TMP137]] +; CHECK-INTERLEAVED-NEXT: [[TMP140:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP140]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i16, ptr [[GEP_B]], align 2 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i16 [[LOAD_B]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP140]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-MAXBW-LABEL: define i32 @not_dotp_different_types( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: @@ -441,8 +553,32 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[TMP68:%.*]] = mul <16 x i32> [[TMP67]], [[TMP18]] ; CHECK-MAXBW-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-MAXBW-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-MAXBW-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-MAXBW: middle.block: +; 
CHECK-MAXBW-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]]) +; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP71]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i16, ptr [[GEP_B]], align 2 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i16 [[LOAD_B]] to i32 +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP71]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]] ; entry: br label %for.body @@ -459,7 +595,7 @@ for.body: ; preds = %for.body, %entry %mul = mul i32 %ext.b, %ext.a %add = add i32 %mul, %accum %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 + %exitcond.not = icmp eq i64 %iv.next, 1024 br i1 %exitcond.not, label %for.exit, label %for.body for.exit: ; preds = %for.body @@ -470,7 +606,7 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_loop_carried( ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: @@ -489,13 +625,38 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP9]], i32 15 +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15 +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi 
i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_loop_carried( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: @@ -514,13 +675,38 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP9]], i32 15 +; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15 +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] +; 
CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_loop_carried( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: @@ -539,8 +725,33 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP9]], i32 15 +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15 +; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-MAXBW-NEXT: [[MUL]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; 
CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]] ; entry: br label %for.body @@ -557,7 +768,7 @@ for.body: ; preds = %for.body, %entry %mul = mul i32 %ext.b, %ext.a %add = add i32 %mul, %accum %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 + %exitcond.not = icmp eq i64 %iv.next, 1024 br i1 %exitcond.not, label %for.exit, label %for.body for.exit: ; preds = %for.body @@ -568,7 +779,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_phi( ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: @@ -586,13 +797,38 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] ; CHECK-INTERLEAVE1-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = extractelement <16 x i32> [[TMP8]], i32 15 +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP8]], i32 15 +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[EXT_B]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ 
[[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_phi( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: @@ -610,13 +846,38 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] ; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = extractelement <16 x i32> [[TMP8]], i32 15 +; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP8]], i32 15 +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[EXT_B]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_phi( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: @@ -634,8 +895,33 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) { ; 
CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = extractelement <16 x i32> [[TMP8]], i32 15 +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP8]], i32 15 +; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = add i32 [[MUL]], [[EXT_B]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]] ; entry: br label %for.body @@ -652,7 +938,7 @@ for.body: ; preds = %for.body, %entry %mul = mul i32 %ext.b, %ext.a %add = add i32 %mul, %ext.b %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 + %exitcond.not = icmp eq i64 %iv.next, 1024 br i1 %exitcond.not, label %for.exit, label %for.body for.exit: ; preds = %for.body @@ -729,6 +1015,66 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP33]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX14:%.*]] = phi i32 [ [[TMP34]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX15:%.*]] = phi i32 [ [[TMP35]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i32 [ [[TMP36]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; 
CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM3:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD_A3:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM2:%.*]] = phi i32 [ [[BC_MERGE_RDX14]], [[SCALAR_PH]] ], [ [[ADD_A2:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM1:%.*]] = phi i32 [ [[BC_MERGE_RDX15]], [[SCALAR_PH]] ], [ [[ADD_A1:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM0:%.*]] = phi i32 [ [[BC_MERGE_RDX16]], [[SCALAR_PH]] ], [ [[ADD_A0:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[GEP_B0:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[OFFSET_1:%.*]] = or disjoint i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[GEP_A1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_1]] +; CHECK-INTERLEAVE1-NEXT: [[GEP_B1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_1]] +; CHECK-INTERLEAVE1-NEXT: [[OFFSET_2:%.*]] = or disjoint i64 [[IV]], 2 +; CHECK-INTERLEAVE1-NEXT: [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_2]] +; CHECK-INTERLEAVE1-NEXT: [[GEP_B2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_2]] +; CHECK-INTERLEAVE1-NEXT: [[OFFSET_3:%.*]] = or disjoint i64 [[IV]], 3 +; CHECK-INTERLEAVE1-NEXT: [[GEP_A3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_3]] +; CHECK-INTERLEAVE1-NEXT: [[GEP_B3:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_3]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A0:%.*]] = load i8, ptr [[GEP_A0]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A0:%.*]] = sext i8 [[LOAD_A0]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B0:%.*]] = load i8, ptr [[GEP_B0]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B0:%.*]] = sext i8 [[LOAD_B0]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL_A0:%.*]] = mul nsw i32 [[EXT_B0]], [[EXT_A0]] +; CHECK-INTERLEAVE1-NEXT: [[ADD_A0]] = add nsw i32 [[MUL_A0]], [[ACCUM0]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A1:%.*]] = load i8, ptr [[GEP_A1]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A1:%.*]] = sext i8 [[LOAD_A1]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B1:%.*]] = load i8, ptr [[GEP_B1]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B1:%.*]] = sext i8 [[LOAD_B1]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL_A1:%.*]] = mul nsw i32 [[EXT_A1]], [[EXT_B1]] +; CHECK-INTERLEAVE1-NEXT: [[ADD_A1]] = add nsw i32 [[MUL_A1]], [[ACCUM1]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A2:%.*]] = load i8, ptr [[GEP_A2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A2:%.*]] = sext i8 [[LOAD_A2]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B2:%.*]] = load i8, ptr [[GEP_B2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B2:%.*]] = sext i8 [[LOAD_B2]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL_A2:%.*]] = mul nsw i32 [[EXT_A2]], [[EXT_B2]] +; CHECK-INTERLEAVE1-NEXT: [[ADD_A2]] = add nsw i32 [[MUL_A2]], [[ACCUM2]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A3:%.*]] = load i8, ptr [[GEP_A3]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A3:%.*]] = sext i8 [[LOAD_A3]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B3:%.*]] = load i8, ptr [[GEP_B3]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B3:%.*]] = sext i8 [[LOAD_B3]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL_A3:%.*]] = mul nsw i32 [[EXT_A3]], [[EXT_B3]] +; CHECK-INTERLEAVE1-NEXT: [[ADD_A3]] = add nsw i32 [[MUL_A3]], [[ACCUM3]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw 
i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[NUM_IN]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-INTERLEAVE1: exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_A0_LCSSA:%.*]] = phi i32 [ [[ADD_A0]], [[FOR_BODY]] ], [ [[TMP36]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: [[ADD_A1_LCSSA:%.*]] = phi i32 [ [[ADD_A1]], [[FOR_BODY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: [[ADD_A2_LCSSA:%.*]] = phi i32 [ [[ADD_A2]], [[FOR_BODY]] ], [ [[TMP34]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: [[ADD_A3_LCSSA:%.*]] = phi i32 [ [[ADD_A3]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: [[RESULT0:%.*]] = add nsw i32 [[ADD_A0_LCSSA]], [[ADD_A1_LCSSA]] +; CHECK-INTERLEAVE1-NEXT: [[RESULT1:%.*]] = add nsw i32 [[ADD_A2_LCSSA]], [[ADD_A3_LCSSA]] +; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = add nsw i32 [[RESULT0]], [[RESULT1]] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[RESULT]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled( ; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -799,6 +1145,66 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP33]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX14:%.*]] = phi i32 [ [[TMP34]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX15:%.*]] = phi i32 [ [[TMP35]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i32 [ [[TMP36]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM3:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD_A3:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM2:%.*]] = phi i32 [ [[BC_MERGE_RDX14]], [[SCALAR_PH]] ], [ [[ADD_A2:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM1:%.*]] = phi i32 [ [[BC_MERGE_RDX15]], [[SCALAR_PH]] ], [ [[ADD_A1:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM0:%.*]] = phi i32 [ [[BC_MERGE_RDX16]], [[SCALAR_PH]] ], [ [[ADD_A0:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[GEP_B0:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[OFFSET_1:%.*]] = or disjoint i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[GEP_A1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_1]] +; CHECK-INTERLEAVED-NEXT: [[GEP_B1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_1]] +; CHECK-INTERLEAVED-NEXT: [[OFFSET_2:%.*]] = or disjoint i64 [[IV]], 2 +; CHECK-INTERLEAVED-NEXT: [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_2]] +; CHECK-INTERLEAVED-NEXT: [[GEP_B2:%.*]] = getelementptr inbounds i8, ptr 
[[B]], i64 [[OFFSET_2]] +; CHECK-INTERLEAVED-NEXT: [[OFFSET_3:%.*]] = or disjoint i64 [[IV]], 3 +; CHECK-INTERLEAVED-NEXT: [[GEP_A3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_3]] +; CHECK-INTERLEAVED-NEXT: [[GEP_B3:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_3]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A0:%.*]] = load i8, ptr [[GEP_A0]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A0:%.*]] = sext i8 [[LOAD_A0]] to i32 +; CHECK-INTERLEAVED-NEXT: [[LOAD_B0:%.*]] = load i8, ptr [[GEP_B0]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B0:%.*]] = sext i8 [[LOAD_B0]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL_A0:%.*]] = mul nsw i32 [[EXT_B0]], [[EXT_A0]] +; CHECK-INTERLEAVED-NEXT: [[ADD_A0]] = add nsw i32 [[MUL_A0]], [[ACCUM0]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A1:%.*]] = load i8, ptr [[GEP_A1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A1:%.*]] = sext i8 [[LOAD_A1]] to i32 +; CHECK-INTERLEAVED-NEXT: [[LOAD_B1:%.*]] = load i8, ptr [[GEP_B1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B1:%.*]] = sext i8 [[LOAD_B1]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL_A1:%.*]] = mul nsw i32 [[EXT_A1]], [[EXT_B1]] +; CHECK-INTERLEAVED-NEXT: [[ADD_A1]] = add nsw i32 [[MUL_A1]], [[ACCUM1]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A2:%.*]] = load i8, ptr [[GEP_A2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A2:%.*]] = sext i8 [[LOAD_A2]] to i32 +; CHECK-INTERLEAVED-NEXT: [[LOAD_B2:%.*]] = load i8, ptr [[GEP_B2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B2:%.*]] = sext i8 [[LOAD_B2]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL_A2:%.*]] = mul nsw i32 [[EXT_A2]], [[EXT_B2]] +; CHECK-INTERLEAVED-NEXT: [[ADD_A2]] = add nsw i32 [[MUL_A2]], [[ACCUM2]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A3:%.*]] = load i8, ptr [[GEP_A3]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A3:%.*]] = sext i8 [[LOAD_A3]] to i32 +; CHECK-INTERLEAVED-NEXT: [[LOAD_B3:%.*]] = load i8, ptr [[GEP_B3]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B3:%.*]] = sext i8 [[LOAD_B3]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL_A3:%.*]] = mul nsw i32 [[EXT_A3]], [[EXT_B3]] +; CHECK-INTERLEAVED-NEXT: [[ADD_A3]] = add nsw i32 [[MUL_A3]], [[ACCUM3]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[NUM_IN]] +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-INTERLEAVED: exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_A0_LCSSA:%.*]] = phi i32 [ [[ADD_A0]], [[FOR_BODY]] ], [ [[TMP36]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: [[ADD_A1_LCSSA:%.*]] = phi i32 [ [[ADD_A1]], [[FOR_BODY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: [[ADD_A2_LCSSA:%.*]] = phi i32 [ [[ADD_A2]], [[FOR_BODY]] ], [ [[TMP34]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: [[ADD_A3_LCSSA:%.*]] = phi i32 [ [[ADD_A3]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: [[RESULT0:%.*]] = add nsw i32 [[ADD_A0_LCSSA]], [[ADD_A1_LCSSA]] +; CHECK-INTERLEAVED-NEXT: [[RESULT1:%.*]] = add nsw i32 [[ADD_A2_LCSSA]], [[ADD_A3_LCSSA]] +; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = add nsw i32 [[RESULT0]], [[RESULT1]] +; CHECK-INTERLEAVED-NEXT: ret i32 [[RESULT]] ; ; CHECK-MAXBW-LABEL: define i32 @dotp_unrolled( ; CHECK-MAXBW-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -869,6 +1275,66 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; 
CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP33]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX14:%.*]] = phi i32 [ [[TMP34]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX15:%.*]] = phi i32 [ [[TMP35]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i32 [ [[TMP36]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM3:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD_A3:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM2:%.*]] = phi i32 [ [[BC_MERGE_RDX14]], [[SCALAR_PH]] ], [ [[ADD_A2:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM1:%.*]] = phi i32 [ [[BC_MERGE_RDX15]], [[SCALAR_PH]] ], [ [[ADD_A1:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM0:%.*]] = phi i32 [ [[BC_MERGE_RDX16]], [[SCALAR_PH]] ], [ [[ADD_A0:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[GEP_B0:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[OFFSET_1:%.*]] = or disjoint i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[GEP_A1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_1]] +; CHECK-MAXBW-NEXT: [[GEP_B1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_1]] +; CHECK-MAXBW-NEXT: [[OFFSET_2:%.*]] = or disjoint i64 [[IV]], 2 +; CHECK-MAXBW-NEXT: [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_2]] +; CHECK-MAXBW-NEXT: [[GEP_B2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_2]] +; CHECK-MAXBW-NEXT: [[OFFSET_3:%.*]] = or disjoint i64 [[IV]], 3 +; CHECK-MAXBW-NEXT: [[GEP_A3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_3]] +; CHECK-MAXBW-NEXT: [[GEP_B3:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_3]] +; CHECK-MAXBW-NEXT: [[LOAD_A0:%.*]] = load i8, ptr [[GEP_A0]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A0:%.*]] = sext i8 [[LOAD_A0]] to i32 +; CHECK-MAXBW-NEXT: [[LOAD_B0:%.*]] = load i8, ptr [[GEP_B0]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B0:%.*]] = sext i8 [[LOAD_B0]] to i32 +; CHECK-MAXBW-NEXT: [[MUL_A0:%.*]] = mul nsw i32 [[EXT_B0]], [[EXT_A0]] +; CHECK-MAXBW-NEXT: [[ADD_A0]] = add nsw i32 [[MUL_A0]], [[ACCUM0]] +; CHECK-MAXBW-NEXT: [[LOAD_A1:%.*]] = load i8, ptr [[GEP_A1]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A1:%.*]] = sext i8 [[LOAD_A1]] to i32 +; CHECK-MAXBW-NEXT: [[LOAD_B1:%.*]] = load i8, ptr [[GEP_B1]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B1:%.*]] = sext i8 [[LOAD_B1]] to i32 +; CHECK-MAXBW-NEXT: [[MUL_A1:%.*]] = mul nsw i32 [[EXT_A1]], [[EXT_B1]] +; CHECK-MAXBW-NEXT: [[ADD_A1]] = add nsw i32 [[MUL_A1]], [[ACCUM1]] +; CHECK-MAXBW-NEXT: [[LOAD_A2:%.*]] = load i8, ptr [[GEP_A2]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A2:%.*]] = sext i8 [[LOAD_A2]] to i32 +; CHECK-MAXBW-NEXT: [[LOAD_B2:%.*]] = load i8, ptr [[GEP_B2]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B2:%.*]] = sext i8 [[LOAD_B2]] to i32 +; CHECK-MAXBW-NEXT: [[MUL_A2:%.*]] = mul nsw i32 [[EXT_A2]], [[EXT_B2]] +; CHECK-MAXBW-NEXT: [[ADD_A2]] = add nsw i32 [[MUL_A2]], [[ACCUM2]] +; CHECK-MAXBW-NEXT: 
[[LOAD_A3:%.*]] = load i8, ptr [[GEP_A3]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A3:%.*]] = sext i8 [[LOAD_A3]] to i32 +; CHECK-MAXBW-NEXT: [[LOAD_B3:%.*]] = load i8, ptr [[GEP_B3]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B3:%.*]] = sext i8 [[LOAD_B3]] to i32 +; CHECK-MAXBW-NEXT: [[MUL_A3:%.*]] = mul nsw i32 [[EXT_A3]], [[EXT_B3]] +; CHECK-MAXBW-NEXT: [[ADD_A3]] = add nsw i32 [[MUL_A3]], [[ACCUM3]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[NUM_IN]] +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-MAXBW: exit: +; CHECK-MAXBW-NEXT: [[ADD_A0_LCSSA:%.*]] = phi i32 [ [[ADD_A0]], [[FOR_BODY]] ], [ [[TMP36]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: [[ADD_A1_LCSSA:%.*]] = phi i32 [ [[ADD_A1]], [[FOR_BODY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: [[ADD_A2_LCSSA:%.*]] = phi i32 [ [[ADD_A2]], [[FOR_BODY]] ], [ [[TMP34]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: [[ADD_A3_LCSSA:%.*]] = phi i32 [ [[ADD_A3]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: [[RESULT0:%.*]] = add nsw i32 [[ADD_A0_LCSSA]], [[ADD_A1_LCSSA]] +; CHECK-MAXBW-NEXT: [[RESULT1:%.*]] = add nsw i32 [[ADD_A2_LCSSA]], [[ADD_A3_LCSSA]] +; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = add nsw i32 [[RESULT0]], [[RESULT1]] +; CHECK-MAXBW-NEXT: ret i32 [[RESULT]] ; entry: br label %for.body @@ -956,6 +1422,27 @@ define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-INTERLEAVE1: exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated( ; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr 
[[B:%.*]]) #[[ATTR0]] { @@ -997,6 +1484,27 @@ define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-INTERLEAVED: exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated( ; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -1028,6 +1536,27 @@ define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32 +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = add nsw i32 
[[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-MAXBW: exit: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]] ; entry: br label %for.body @@ -1087,6 +1616,325 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-INTERLEAVE1: pred.load.if: +; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> poison, i8 [[TMP19]], i32 0 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE]] +; CHECK-INTERLEAVE1: pred.load.continue: +; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_LOAD_IF]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; CHECK-INTERLEAVE1: pred.load.if1: +; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP24]], i32 1 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; CHECK-INTERLEAVE1: pred.load.continue2: +; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = phi <16 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF1]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; CHECK-INTERLEAVE1: pred.load.if3: +; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]] +; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP29]], i32 2 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; CHECK-INTERLEAVE1: pred.load.continue4: +; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = phi <16 x i8> [ [[TMP26]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP30]], [[PRED_LOAD_IF3]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] +; CHECK-INTERLEAVE1: pred.load.if5: +; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP34]], i32 3 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; CHECK-INTERLEAVE1: pred.load.continue6: +; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = phi <16 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP35]], [[PRED_LOAD_IF5]] ] +; 
CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] +; CHECK-INTERLEAVE1: pred.load.if7: +; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]] +; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP39]], i32 4 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE8]] +; CHECK-INTERLEAVE1: pred.load.continue8: +; CHECK-INTERLEAVE1-NEXT: [[TMP41:%.*]] = phi <16 x i8> [ [[TMP36]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP40]], [[PRED_LOAD_IF7]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] +; CHECK-INTERLEAVE1: pred.load.if9: +; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP44]], i32 5 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE10]] +; CHECK-INTERLEAVE1: pred.load.continue10: +; CHECK-INTERLEAVE1-NEXT: [[TMP46:%.*]] = phi <16 x i8> [ [[TMP41]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP45]], [[PRED_LOAD_IF9]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] +; CHECK-INTERLEAVE1: pred.load.if11: +; CHECK-INTERLEAVE1-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP50:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP49]], i32 6 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE12]] +; CHECK-INTERLEAVE1: pred.load.continue12: +; CHECK-INTERLEAVE1-NEXT: [[TMP51:%.*]] = phi <16 x i8> [ [[TMP46]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP50]], [[PRED_LOAD_IF11]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] +; CHECK-INTERLEAVE1: pred.load.if13: +; CHECK-INTERLEAVE1-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]] +; CHECK-INTERLEAVE1-NEXT: [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP55:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP54]], i32 7 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE14]] +; CHECK-INTERLEAVE1: pred.load.continue14: +; CHECK-INTERLEAVE1-NEXT: [[TMP56:%.*]] = phi <16 x i8> [ [[TMP51]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP55]], [[PRED_LOAD_IF13]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] +; CHECK-INTERLEAVE1: pred.load.if15: +; CHECK-INTERLEAVE1-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]] +; CHECK-INTERLEAVE1-NEXT: [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP60:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP59]], i32 8 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE16]] +; CHECK-INTERLEAVE1: pred.load.continue16: +; 
CHECK-INTERLEAVE1-NEXT: [[TMP61:%.*]] = phi <16 x i8> [ [[TMP56]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP60]], [[PRED_LOAD_IF15]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]] +; CHECK-INTERLEAVE1: pred.load.if17: +; CHECK-INTERLEAVE1-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP64:%.*]] = load i8, ptr [[TMP63]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP65:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP64]], i32 9 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE18]] +; CHECK-INTERLEAVE1: pred.load.continue18: +; CHECK-INTERLEAVE1-NEXT: [[TMP66:%.*]] = phi <16 x i8> [ [[TMP61]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP65]], [[PRED_LOAD_IF17]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]] +; CHECK-INTERLEAVE1: pred.load.if19: +; CHECK-INTERLEAVE1-NEXT: [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]] +; CHECK-INTERLEAVE1-NEXT: [[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = insertelement <16 x i8> [[TMP66]], i8 [[TMP69]], i32 10 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE20]] +; CHECK-INTERLEAVE1: pred.load.continue20: +; CHECK-INTERLEAVE1-NEXT: [[TMP71:%.*]] = phi <16 x i8> [ [[TMP66]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP70]], [[PRED_LOAD_IF19]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP72:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]] +; CHECK-INTERLEAVE1: pred.load.if21: +; CHECK-INTERLEAVE1-NEXT: [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]] +; CHECK-INTERLEAVE1-NEXT: [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP75:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP74]], i32 11 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE22]] +; CHECK-INTERLEAVE1: pred.load.continue22: +; CHECK-INTERLEAVE1-NEXT: [[TMP76:%.*]] = phi <16 x i8> [ [[TMP71]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP75]], [[PRED_LOAD_IF21]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP77:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP77]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]] +; CHECK-INTERLEAVE1: pred.load.if23: +; CHECK-INTERLEAVE1-NEXT: [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] +; CHECK-INTERLEAVE1-NEXT: [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP80:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP79]], i32 12 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE24]] +; CHECK-INTERLEAVE1: pred.load.continue24: +; CHECK-INTERLEAVE1-NEXT: [[TMP81:%.*]] = phi <16 x i8> [ [[TMP76]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP80]], [[PRED_LOAD_IF23]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP82:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP82]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]] +; CHECK-INTERLEAVE1: pred.load.if25: +; CHECK-INTERLEAVE1-NEXT: [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]] +; CHECK-INTERLEAVE1-NEXT: [[TMP84:%.*]] = load i8, ptr [[TMP83]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP85:%.*]] = insertelement <16 x i8> 
[[TMP81]], i8 [[TMP84]], i32 13 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE26]] +; CHECK-INTERLEAVE1: pred.load.continue26: +; CHECK-INTERLEAVE1-NEXT: [[TMP86:%.*]] = phi <16 x i8> [ [[TMP81]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP85]], [[PRED_LOAD_IF25]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP87:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]] +; CHECK-INTERLEAVE1: pred.load.if27: +; CHECK-INTERLEAVE1-NEXT: [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]] +; CHECK-INTERLEAVE1-NEXT: [[TMP89:%.*]] = load i8, ptr [[TMP88]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP90:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP89]], i32 14 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE28]] +; CHECK-INTERLEAVE1: pred.load.continue28: +; CHECK-INTERLEAVE1-NEXT: [[TMP91:%.*]] = phi <16 x i8> [ [[TMP86]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP90]], [[PRED_LOAD_IF27]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP92:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]] +; CHECK-INTERLEAVE1: pred.load.if29: +; CHECK-INTERLEAVE1-NEXT: [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]] +; CHECK-INTERLEAVE1-NEXT: [[TMP94:%.*]] = load i8, ptr [[TMP93]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP95:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP94]], i32 15 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE30]] +; CHECK-INTERLEAVE1: pred.load.continue30: +; CHECK-INTERLEAVE1-NEXT: [[TMP96:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP96]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP98:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP98]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]] +; CHECK-INTERLEAVE1: pred.load.if31: +; CHECK-INTERLEAVE1-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP101:%.*]] = insertelement <16 x i8> poison, i8 [[TMP100]], i32 0 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE32]] +; CHECK-INTERLEAVE1: pred.load.continue32: +; CHECK-INTERLEAVE1-NEXT: [[TMP102:%.*]] = phi <16 x i8> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP101]], [[PRED_LOAD_IF31]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP103:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP103]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]] +; CHECK-INTERLEAVE1: pred.load.if33: +; CHECK-INTERLEAVE1-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP106:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP105]], i32 1 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE34]] +; CHECK-INTERLEAVE1: pred.load.continue34: +; CHECK-INTERLEAVE1-NEXT: [[TMP107:%.*]] = phi <16 x i8> [ [[TMP102]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP106]], [[PRED_LOAD_IF33]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP108:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP108]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]] +; CHECK-INTERLEAVE1: pred.load.if35: +; 
CHECK-INTERLEAVE1-NEXT: [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]] +; CHECK-INTERLEAVE1-NEXT: [[TMP110:%.*]] = load i8, ptr [[TMP109]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP111:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP110]], i32 2 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE36]] +; CHECK-INTERLEAVE1: pred.load.continue36: +; CHECK-INTERLEAVE1-NEXT: [[TMP112:%.*]] = phi <16 x i8> [ [[TMP107]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP111]], [[PRED_LOAD_IF35]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP113:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]] +; CHECK-INTERLEAVE1: pred.load.if37: +; CHECK-INTERLEAVE1-NEXT: [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP115:%.*]] = load i8, ptr [[TMP114]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP116:%.*]] = insertelement <16 x i8> [[TMP112]], i8 [[TMP115]], i32 3 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE38]] +; CHECK-INTERLEAVE1: pred.load.continue38: +; CHECK-INTERLEAVE1-NEXT: [[TMP117:%.*]] = phi <16 x i8> [ [[TMP112]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP116]], [[PRED_LOAD_IF37]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP118:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP118]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]] +; CHECK-INTERLEAVE1: pred.load.if39: +; CHECK-INTERLEAVE1-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]] +; CHECK-INTERLEAVE1-NEXT: [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP121:%.*]] = insertelement <16 x i8> [[TMP117]], i8 [[TMP120]], i32 4 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE40]] +; CHECK-INTERLEAVE1: pred.load.continue40: +; CHECK-INTERLEAVE1-NEXT: [[TMP122:%.*]] = phi <16 x i8> [ [[TMP117]], [[PRED_LOAD_CONTINUE38]] ], [ [[TMP121]], [[PRED_LOAD_IF39]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP123:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP123]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]] +; CHECK-INTERLEAVE1: pred.load.if41: +; CHECK-INTERLEAVE1-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP126:%.*]] = insertelement <16 x i8> [[TMP122]], i8 [[TMP125]], i32 5 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE42]] +; CHECK-INTERLEAVE1: pred.load.continue42: +; CHECK-INTERLEAVE1-NEXT: [[TMP127:%.*]] = phi <16 x i8> [ [[TMP122]], [[PRED_LOAD_CONTINUE40]] ], [ [[TMP126]], [[PRED_LOAD_IF41]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP128:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP128]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44:%.*]] +; CHECK-INTERLEAVE1: pred.load.if43: +; CHECK-INTERLEAVE1-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP130:%.*]] = load i8, ptr [[TMP129]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP131:%.*]] = insertelement <16 x i8> [[TMP127]], i8 [[TMP130]], i32 6 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE44]] +; CHECK-INTERLEAVE1: pred.load.continue44: +; CHECK-INTERLEAVE1-NEXT: [[TMP132:%.*]] = phi <16 x i8> [ [[TMP127]], [[PRED_LOAD_CONTINUE42]] ], [ [[TMP131]], [[PRED_LOAD_IF43]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP133:%.*]] = 
extractelement <16 x i1> [[TMP16]], i32 7 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP133]], label [[PRED_LOAD_IF45:%.*]], label [[PRED_LOAD_CONTINUE46:%.*]] +; CHECK-INTERLEAVE1: pred.load.if45: +; CHECK-INTERLEAVE1-NEXT: [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]] +; CHECK-INTERLEAVE1-NEXT: [[TMP135:%.*]] = load i8, ptr [[TMP134]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP136:%.*]] = insertelement <16 x i8> [[TMP132]], i8 [[TMP135]], i32 7 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE46]] +; CHECK-INTERLEAVE1: pred.load.continue46: +; CHECK-INTERLEAVE1-NEXT: [[TMP137:%.*]] = phi <16 x i8> [ [[TMP132]], [[PRED_LOAD_CONTINUE44]] ], [ [[TMP136]], [[PRED_LOAD_IF45]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP138:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP138]], label [[PRED_LOAD_IF47:%.*]], label [[PRED_LOAD_CONTINUE48:%.*]] +; CHECK-INTERLEAVE1: pred.load.if47: +; CHECK-INTERLEAVE1-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]] +; CHECK-INTERLEAVE1-NEXT: [[TMP140:%.*]] = load i8, ptr [[TMP139]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP141:%.*]] = insertelement <16 x i8> [[TMP137]], i8 [[TMP140]], i32 8 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE48]] +; CHECK-INTERLEAVE1: pred.load.continue48: +; CHECK-INTERLEAVE1-NEXT: [[TMP142:%.*]] = phi <16 x i8> [ [[TMP137]], [[PRED_LOAD_CONTINUE46]] ], [ [[TMP141]], [[PRED_LOAD_IF47]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP143:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP143]], label [[PRED_LOAD_IF49:%.*]], label [[PRED_LOAD_CONTINUE50:%.*]] +; CHECK-INTERLEAVE1: pred.load.if49: +; CHECK-INTERLEAVE1-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP146:%.*]] = insertelement <16 x i8> [[TMP142]], i8 [[TMP145]], i32 9 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE50]] +; CHECK-INTERLEAVE1: pred.load.continue50: +; CHECK-INTERLEAVE1-NEXT: [[TMP147:%.*]] = phi <16 x i8> [ [[TMP142]], [[PRED_LOAD_CONTINUE48]] ], [ [[TMP146]], [[PRED_LOAD_IF49]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP148:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP148]], label [[PRED_LOAD_IF51:%.*]], label [[PRED_LOAD_CONTINUE52:%.*]] +; CHECK-INTERLEAVE1: pred.load.if51: +; CHECK-INTERLEAVE1-NEXT: [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]] +; CHECK-INTERLEAVE1-NEXT: [[TMP150:%.*]] = load i8, ptr [[TMP149]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP151:%.*]] = insertelement <16 x i8> [[TMP147]], i8 [[TMP150]], i32 10 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE52]] +; CHECK-INTERLEAVE1: pred.load.continue52: +; CHECK-INTERLEAVE1-NEXT: [[TMP152:%.*]] = phi <16 x i8> [ [[TMP147]], [[PRED_LOAD_CONTINUE50]] ], [ [[TMP151]], [[PRED_LOAD_IF51]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP153:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP153]], label [[PRED_LOAD_IF53:%.*]], label [[PRED_LOAD_CONTINUE54:%.*]] +; CHECK-INTERLEAVE1: pred.load.if53: +; CHECK-INTERLEAVE1-NEXT: [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]] +; CHECK-INTERLEAVE1-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP154]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP156:%.*]] = insertelement <16 x i8> [[TMP152]], i8 [[TMP155]], i32 11 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE54]] +; CHECK-INTERLEAVE1: 
pred.load.continue54: +; CHECK-INTERLEAVE1-NEXT: [[TMP157:%.*]] = phi <16 x i8> [ [[TMP152]], [[PRED_LOAD_CONTINUE52]] ], [ [[TMP156]], [[PRED_LOAD_IF53]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP158:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP158]], label [[PRED_LOAD_IF55:%.*]], label [[PRED_LOAD_CONTINUE56:%.*]] +; CHECK-INTERLEAVE1: pred.load.if55: +; CHECK-INTERLEAVE1-NEXT: [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]] +; CHECK-INTERLEAVE1-NEXT: [[TMP160:%.*]] = load i8, ptr [[TMP159]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP161:%.*]] = insertelement <16 x i8> [[TMP157]], i8 [[TMP160]], i32 12 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE56]] +; CHECK-INTERLEAVE1: pred.load.continue56: +; CHECK-INTERLEAVE1-NEXT: [[TMP162:%.*]] = phi <16 x i8> [ [[TMP157]], [[PRED_LOAD_CONTINUE54]] ], [ [[TMP161]], [[PRED_LOAD_IF55]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP163:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP163]], label [[PRED_LOAD_IF57:%.*]], label [[PRED_LOAD_CONTINUE58:%.*]] +; CHECK-INTERLEAVE1: pred.load.if57: +; CHECK-INTERLEAVE1-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]] +; CHECK-INTERLEAVE1-NEXT: [[TMP165:%.*]] = load i8, ptr [[TMP164]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP166:%.*]] = insertelement <16 x i8> [[TMP162]], i8 [[TMP165]], i32 13 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE58]] +; CHECK-INTERLEAVE1: pred.load.continue58: +; CHECK-INTERLEAVE1-NEXT: [[TMP167:%.*]] = phi <16 x i8> [ [[TMP162]], [[PRED_LOAD_CONTINUE56]] ], [ [[TMP166]], [[PRED_LOAD_IF57]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP168:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP168]], label [[PRED_LOAD_IF59:%.*]], label [[PRED_LOAD_CONTINUE60:%.*]] +; CHECK-INTERLEAVE1: pred.load.if59: +; CHECK-INTERLEAVE1-NEXT: [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]] +; CHECK-INTERLEAVE1-NEXT: [[TMP170:%.*]] = load i8, ptr [[TMP169]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP171:%.*]] = insertelement <16 x i8> [[TMP167]], i8 [[TMP170]], i32 14 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE60]] +; CHECK-INTERLEAVE1: pred.load.continue60: +; CHECK-INTERLEAVE1-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP167]], [[PRED_LOAD_CONTINUE58]] ], [ [[TMP171]], [[PRED_LOAD_IF59]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP173:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP173]], label [[PRED_LOAD_IF61:%.*]], label [[PRED_LOAD_CONTINUE62]] +; CHECK-INTERLEAVE1: pred.load.if61: +; CHECK-INTERLEAVE1-NEXT: [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]] +; CHECK-INTERLEAVE1-NEXT: [[TMP175:%.*]] = load i8, ptr [[TMP174]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP176:%.*]] = insertelement <16 x i8> [[TMP172]], i8 [[TMP175]], i32 15 +; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE62]] +; CHECK-INTERLEAVE1: pred.load.continue62: +; CHECK-INTERLEAVE1-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]] +; CHECK-INTERLEAVE1-NEXT: [[TMP180]] = add <16 x i32> [[TMP179]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[TMP181:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP180]], <16 x i32> [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: 
[[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16) +; CHECK-INTERLEAVE1-NEXT: [[TMP182:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP182]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP183:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP181]]) +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP183]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_A2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-INTERLEAVE1: exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP183]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated_pragma( ; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -1123,6 +1971,325 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-INTERLEAVED: pred.load.if: +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> poison, i8 [[TMP19]], i32 0 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE]] +; CHECK-INTERLEAVED: pred.load.continue: +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_LOAD_IF]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; CHECK-INTERLEAVED: pred.load.if1: +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, 
ptr [[B]], i64 [[TMP1]] +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP24]], i32 1 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; CHECK-INTERLEAVED: pred.load.continue2: +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = phi <16 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF1]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; CHECK-INTERLEAVED: pred.load.if3: +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP29]], i32 2 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; CHECK-INTERLEAVED: pred.load.continue4: +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = phi <16 x i8> [ [[TMP26]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP30]], [[PRED_LOAD_IF3]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] +; CHECK-INTERLEAVED: pred.load.if5: +; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP34]], i32 3 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; CHECK-INTERLEAVED: pred.load.continue6: +; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = phi <16 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP35]], [[PRED_LOAD_IF5]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] +; CHECK-INTERLEAVED: pred.load.if7: +; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP39]], i32 4 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE8]] +; CHECK-INTERLEAVED: pred.load.continue8: +; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = phi <16 x i8> [ [[TMP36]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP40]], [[PRED_LOAD_IF7]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] +; CHECK-INTERLEAVED: pred.load.if9: +; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP44]], i32 5 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE10]] +; CHECK-INTERLEAVED: pred.load.continue10: +; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = phi <16 x i8> [ [[TMP41]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP45]], [[PRED_LOAD_IF9]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] +; 
CHECK-INTERLEAVED: pred.load.if11: +; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP49]], i32 6 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE12]] +; CHECK-INTERLEAVED: pred.load.continue12: +; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = phi <16 x i8> [ [[TMP46]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP50]], [[PRED_LOAD_IF11]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] +; CHECK-INTERLEAVED: pred.load.if13: +; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]] +; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP54]], i32 7 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE14]] +; CHECK-INTERLEAVED: pred.load.continue14: +; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = phi <16 x i8> [ [[TMP51]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP55]], [[PRED_LOAD_IF13]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] +; CHECK-INTERLEAVED: pred.load.if15: +; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP59]], i32 8 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE16]] +; CHECK-INTERLEAVED: pred.load.continue16: +; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = phi <16 x i8> [ [[TMP56]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP60]], [[PRED_LOAD_IF15]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]] +; CHECK-INTERLEAVED: pred.load.if17: +; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[TMP64:%.*]] = load i8, ptr [[TMP63]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP65:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP64]], i32 9 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE18]] +; CHECK-INTERLEAVED: pred.load.continue18: +; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = phi <16 x i8> [ [[TMP61]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP65]], [[PRED_LOAD_IF17]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]] +; CHECK-INTERLEAVED: pred.load.if19: +; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = insertelement <16 x i8> [[TMP66]], i8 [[TMP69]], i32 10 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE20]] +; CHECK-INTERLEAVED: pred.load.continue20: +; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = phi <16 x i8> [ [[TMP66]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP70]], [[PRED_LOAD_IF19]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = extractelement <16 x i1> 
[[TMP16]], i32 11 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]] +; CHECK-INTERLEAVED: pred.load.if21: +; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]] +; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP74]], i32 11 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE22]] +; CHECK-INTERLEAVED: pred.load.continue22: +; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = phi <16 x i8> [ [[TMP71]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP75]], [[PRED_LOAD_IF21]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP77]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]] +; CHECK-INTERLEAVED: pred.load.if23: +; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] +; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP80:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP79]], i32 12 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE24]] +; CHECK-INTERLEAVED: pred.load.continue24: +; CHECK-INTERLEAVED-NEXT: [[TMP81:%.*]] = phi <16 x i8> [ [[TMP76]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP80]], [[PRED_LOAD_IF23]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP82]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]] +; CHECK-INTERLEAVED: pred.load.if25: +; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP84:%.*]] = load i8, ptr [[TMP83]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = insertelement <16 x i8> [[TMP81]], i8 [[TMP84]], i32 13 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE26]] +; CHECK-INTERLEAVED: pred.load.continue26: +; CHECK-INTERLEAVED-NEXT: [[TMP86:%.*]] = phi <16 x i8> [ [[TMP81]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP85]], [[PRED_LOAD_IF25]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP87:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]] +; CHECK-INTERLEAVED: pred.load.if27: +; CHECK-INTERLEAVED-NEXT: [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]] +; CHECK-INTERLEAVED-NEXT: [[TMP89:%.*]] = load i8, ptr [[TMP88]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP90:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP89]], i32 14 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE28]] +; CHECK-INTERLEAVED: pred.load.continue28: +; CHECK-INTERLEAVED-NEXT: [[TMP91:%.*]] = phi <16 x i8> [ [[TMP86]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP90]], [[PRED_LOAD_IF27]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP92:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]] +; CHECK-INTERLEAVED: pred.load.if29: +; CHECK-INTERLEAVED-NEXT: [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]] +; CHECK-INTERLEAVED-NEXT: [[TMP94:%.*]] = load i8, ptr [[TMP93]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP95:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP94]], i32 15 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE30]] +; CHECK-INTERLEAVED: pred.load.continue30: +; CHECK-INTERLEAVED-NEXT: [[TMP96:%.*]] = phi <16 x i8> [ 
[[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP96]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP98:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP98]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]] +; CHECK-INTERLEAVED: pred.load.if31: +; CHECK-INTERLEAVED-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP101:%.*]] = insertelement <16 x i8> poison, i8 [[TMP100]], i32 0 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE32]] +; CHECK-INTERLEAVED: pred.load.continue32: +; CHECK-INTERLEAVED-NEXT: [[TMP102:%.*]] = phi <16 x i8> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP101]], [[PRED_LOAD_IF31]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP103:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP103]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]] +; CHECK-INTERLEAVED: pred.load.if33: +; CHECK-INTERLEAVED-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]] +; CHECK-INTERLEAVED-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP106:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP105]], i32 1 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE34]] +; CHECK-INTERLEAVED: pred.load.continue34: +; CHECK-INTERLEAVED-NEXT: [[TMP107:%.*]] = phi <16 x i8> [ [[TMP102]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP106]], [[PRED_LOAD_IF33]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP108:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP108]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]] +; CHECK-INTERLEAVED: pred.load.if35: +; CHECK-INTERLEAVED-NEXT: [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP110:%.*]] = load i8, ptr [[TMP109]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP111:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP110]], i32 2 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE36]] +; CHECK-INTERLEAVED: pred.load.continue36: +; CHECK-INTERLEAVED-NEXT: [[TMP112:%.*]] = phi <16 x i8> [ [[TMP107]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP111]], [[PRED_LOAD_IF35]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP113:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]] +; CHECK-INTERLEAVED: pred.load.if37: +; CHECK-INTERLEAVED-NEXT: [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP115:%.*]] = load i8, ptr [[TMP114]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP116:%.*]] = insertelement <16 x i8> [[TMP112]], i8 [[TMP115]], i32 3 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE38]] +; CHECK-INTERLEAVED: pred.load.continue38: +; CHECK-INTERLEAVED-NEXT: [[TMP117:%.*]] = phi <16 x i8> [ [[TMP112]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP116]], [[PRED_LOAD_IF37]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP118:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP118]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]] +; CHECK-INTERLEAVED: pred.load.if39: +; CHECK-INTERLEAVED-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1 +; 
CHECK-INTERLEAVED-NEXT: [[TMP121:%.*]] = insertelement <16 x i8> [[TMP117]], i8 [[TMP120]], i32 4 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE40]] +; CHECK-INTERLEAVED: pred.load.continue40: +; CHECK-INTERLEAVED-NEXT: [[TMP122:%.*]] = phi <16 x i8> [ [[TMP117]], [[PRED_LOAD_CONTINUE38]] ], [ [[TMP121]], [[PRED_LOAD_IF39]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP123:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP123]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]] +; CHECK-INTERLEAVED: pred.load.if41: +; CHECK-INTERLEAVED-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP126:%.*]] = insertelement <16 x i8> [[TMP122]], i8 [[TMP125]], i32 5 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE42]] +; CHECK-INTERLEAVED: pred.load.continue42: +; CHECK-INTERLEAVED-NEXT: [[TMP127:%.*]] = phi <16 x i8> [ [[TMP122]], [[PRED_LOAD_CONTINUE40]] ], [ [[TMP126]], [[PRED_LOAD_IF41]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP128:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP128]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44:%.*]] +; CHECK-INTERLEAVED: pred.load.if43: +; CHECK-INTERLEAVED-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP130:%.*]] = load i8, ptr [[TMP129]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP131:%.*]] = insertelement <16 x i8> [[TMP127]], i8 [[TMP130]], i32 6 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE44]] +; CHECK-INTERLEAVED: pred.load.continue44: +; CHECK-INTERLEAVED-NEXT: [[TMP132:%.*]] = phi <16 x i8> [ [[TMP127]], [[PRED_LOAD_CONTINUE42]] ], [ [[TMP131]], [[PRED_LOAD_IF43]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP133:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP133]], label [[PRED_LOAD_IF45:%.*]], label [[PRED_LOAD_CONTINUE46:%.*]] +; CHECK-INTERLEAVED: pred.load.if45: +; CHECK-INTERLEAVED-NEXT: [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]] +; CHECK-INTERLEAVED-NEXT: [[TMP135:%.*]] = load i8, ptr [[TMP134]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP136:%.*]] = insertelement <16 x i8> [[TMP132]], i8 [[TMP135]], i32 7 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE46]] +; CHECK-INTERLEAVED: pred.load.continue46: +; CHECK-INTERLEAVED-NEXT: [[TMP137:%.*]] = phi <16 x i8> [ [[TMP132]], [[PRED_LOAD_CONTINUE44]] ], [ [[TMP136]], [[PRED_LOAD_IF45]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP138:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP138]], label [[PRED_LOAD_IF47:%.*]], label [[PRED_LOAD_CONTINUE48:%.*]] +; CHECK-INTERLEAVED: pred.load.if47: +; CHECK-INTERLEAVED-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[TMP140:%.*]] = load i8, ptr [[TMP139]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP141:%.*]] = insertelement <16 x i8> [[TMP137]], i8 [[TMP140]], i32 8 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE48]] +; CHECK-INTERLEAVED: pred.load.continue48: +; CHECK-INTERLEAVED-NEXT: [[TMP142:%.*]] = phi <16 x i8> [ [[TMP137]], [[PRED_LOAD_CONTINUE46]] ], [ [[TMP141]], [[PRED_LOAD_IF47]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP143:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP143]], label [[PRED_LOAD_IF49:%.*]], label [[PRED_LOAD_CONTINUE50:%.*]] +; CHECK-INTERLEAVED: 
pred.load.if49: +; CHECK-INTERLEAVED-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP146:%.*]] = insertelement <16 x i8> [[TMP142]], i8 [[TMP145]], i32 9 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE50]] +; CHECK-INTERLEAVED: pred.load.continue50: +; CHECK-INTERLEAVED-NEXT: [[TMP147:%.*]] = phi <16 x i8> [ [[TMP142]], [[PRED_LOAD_CONTINUE48]] ], [ [[TMP146]], [[PRED_LOAD_IF49]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP148:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP148]], label [[PRED_LOAD_IF51:%.*]], label [[PRED_LOAD_CONTINUE52:%.*]] +; CHECK-INTERLEAVED: pred.load.if51: +; CHECK-INTERLEAVED-NEXT: [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP150:%.*]] = load i8, ptr [[TMP149]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP151:%.*]] = insertelement <16 x i8> [[TMP147]], i8 [[TMP150]], i32 10 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE52]] +; CHECK-INTERLEAVED: pred.load.continue52: +; CHECK-INTERLEAVED-NEXT: [[TMP152:%.*]] = phi <16 x i8> [ [[TMP147]], [[PRED_LOAD_CONTINUE50]] ], [ [[TMP151]], [[PRED_LOAD_IF51]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP153:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP153]], label [[PRED_LOAD_IF53:%.*]], label [[PRED_LOAD_CONTINUE54:%.*]] +; CHECK-INTERLEAVED: pred.load.if53: +; CHECK-INTERLEAVED-NEXT: [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]] +; CHECK-INTERLEAVED-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP154]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP156:%.*]] = insertelement <16 x i8> [[TMP152]], i8 [[TMP155]], i32 11 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE54]] +; CHECK-INTERLEAVED: pred.load.continue54: +; CHECK-INTERLEAVED-NEXT: [[TMP157:%.*]] = phi <16 x i8> [ [[TMP152]], [[PRED_LOAD_CONTINUE52]] ], [ [[TMP156]], [[PRED_LOAD_IF53]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP158:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP158]], label [[PRED_LOAD_IF55:%.*]], label [[PRED_LOAD_CONTINUE56:%.*]] +; CHECK-INTERLEAVED: pred.load.if55: +; CHECK-INTERLEAVED-NEXT: [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]] +; CHECK-INTERLEAVED-NEXT: [[TMP160:%.*]] = load i8, ptr [[TMP159]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP161:%.*]] = insertelement <16 x i8> [[TMP157]], i8 [[TMP160]], i32 12 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE56]] +; CHECK-INTERLEAVED: pred.load.continue56: +; CHECK-INTERLEAVED-NEXT: [[TMP162:%.*]] = phi <16 x i8> [ [[TMP157]], [[PRED_LOAD_CONTINUE54]] ], [ [[TMP161]], [[PRED_LOAD_IF55]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP163:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP163]], label [[PRED_LOAD_IF57:%.*]], label [[PRED_LOAD_CONTINUE58:%.*]] +; CHECK-INTERLEAVED: pred.load.if57: +; CHECK-INTERLEAVED-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP165:%.*]] = load i8, ptr [[TMP164]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP166:%.*]] = insertelement <16 x i8> [[TMP162]], i8 [[TMP165]], i32 13 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE58]] +; CHECK-INTERLEAVED: pred.load.continue58: +; CHECK-INTERLEAVED-NEXT: [[TMP167:%.*]] = phi <16 x i8> [ [[TMP162]], [[PRED_LOAD_CONTINUE56]] ], [ [[TMP166]], [[PRED_LOAD_IF57]] ] +; 
CHECK-INTERLEAVED-NEXT: [[TMP168:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP168]], label [[PRED_LOAD_IF59:%.*]], label [[PRED_LOAD_CONTINUE60:%.*]] +; CHECK-INTERLEAVED: pred.load.if59: +; CHECK-INTERLEAVED-NEXT: [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]] +; CHECK-INTERLEAVED-NEXT: [[TMP170:%.*]] = load i8, ptr [[TMP169]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP171:%.*]] = insertelement <16 x i8> [[TMP167]], i8 [[TMP170]], i32 14 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE60]] +; CHECK-INTERLEAVED: pred.load.continue60: +; CHECK-INTERLEAVED-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP167]], [[PRED_LOAD_CONTINUE58]] ], [ [[TMP171]], [[PRED_LOAD_IF59]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP173:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP173]], label [[PRED_LOAD_IF61:%.*]], label [[PRED_LOAD_CONTINUE62]] +; CHECK-INTERLEAVED: pred.load.if61: +; CHECK-INTERLEAVED-NEXT: [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]] +; CHECK-INTERLEAVED-NEXT: [[TMP175:%.*]] = load i8, ptr [[TMP174]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP176:%.*]] = insertelement <16 x i8> [[TMP172]], i8 [[TMP175]], i32 15 +; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE62]] +; CHECK-INTERLEAVED: pred.load.continue62: +; CHECK-INTERLEAVED-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]] +; CHECK-INTERLEAVED-NEXT: [[TMP180]] = add <16 x i32> [[TMP179]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP181:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP180]], <16 x i32> [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16) +; CHECK-INTERLEAVED-NEXT: [[TMP182:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP182]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[TMP183:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP181]]) +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP183]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVED-NEXT: [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_A2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul nsw i32 
[[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-INTERLEAVED: exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP183]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated_pragma( ; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -1159,6 +2326,325 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 ; CHECK-MAXBW-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-MAXBW: pred.load.if: +; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 +; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> poison, i8 [[TMP19]], i32 0 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE]] +; CHECK-MAXBW: pred.load.continue: +; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_LOAD_IF]] ] +; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1 +; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; CHECK-MAXBW: pred.load.if1: +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] +; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1 +; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP24]], i32 1 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; CHECK-MAXBW: pred.load.continue2: +; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = phi <16 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF1]] ] +; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2 +; CHECK-MAXBW-NEXT: br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; CHECK-MAXBW: pred.load.if3: +; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]] +; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1 +; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP29]], i32 2 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; CHECK-MAXBW: pred.load.continue4: +; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = phi <16 x i8> [ [[TMP26]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP30]], [[PRED_LOAD_IF3]] ] +; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3 +; CHECK-MAXBW-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] +; CHECK-MAXBW: pred.load.if5: +; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1 +; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP34]], i32 3 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; CHECK-MAXBW: pred.load.continue6: +; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = phi <16 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP35]], [[PRED_LOAD_IF5]] ] +; 
CHECK-MAXBW-NEXT: [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4 +; CHECK-MAXBW-NEXT: br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] +; CHECK-MAXBW: pred.load.if7: +; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]] +; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1 +; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP39]], i32 4 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE8]] +; CHECK-MAXBW: pred.load.continue8: +; CHECK-MAXBW-NEXT: [[TMP41:%.*]] = phi <16 x i8> [ [[TMP36]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP40]], [[PRED_LOAD_IF7]] ] +; CHECK-MAXBW-NEXT: [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5 +; CHECK-MAXBW-NEXT: br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] +; CHECK-MAXBW: pred.load.if9: +; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1 +; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP44]], i32 5 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE10]] +; CHECK-MAXBW: pred.load.continue10: +; CHECK-MAXBW-NEXT: [[TMP46:%.*]] = phi <16 x i8> [ [[TMP41]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP45]], [[PRED_LOAD_IF9]] ] +; CHECK-MAXBW-NEXT: [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6 +; CHECK-MAXBW-NEXT: br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] +; CHECK-MAXBW: pred.load.if11: +; CHECK-MAXBW-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1 +; CHECK-MAXBW-NEXT: [[TMP50:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP49]], i32 6 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE12]] +; CHECK-MAXBW: pred.load.continue12: +; CHECK-MAXBW-NEXT: [[TMP51:%.*]] = phi <16 x i8> [ [[TMP46]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP50]], [[PRED_LOAD_IF11]] ] +; CHECK-MAXBW-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7 +; CHECK-MAXBW-NEXT: br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] +; CHECK-MAXBW: pred.load.if13: +; CHECK-MAXBW-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]] +; CHECK-MAXBW-NEXT: [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1 +; CHECK-MAXBW-NEXT: [[TMP55:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP54]], i32 7 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE14]] +; CHECK-MAXBW: pred.load.continue14: +; CHECK-MAXBW-NEXT: [[TMP56:%.*]] = phi <16 x i8> [ [[TMP51]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP55]], [[PRED_LOAD_IF13]] ] +; CHECK-MAXBW-NEXT: [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8 +; CHECK-MAXBW-NEXT: br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] +; CHECK-MAXBW: pred.load.if15: +; CHECK-MAXBW-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]] +; CHECK-MAXBW-NEXT: [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1 +; CHECK-MAXBW-NEXT: [[TMP60:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP59]], i32 8 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE16]] +; CHECK-MAXBW: pred.load.continue16: +; CHECK-MAXBW-NEXT: [[TMP61:%.*]] = phi <16 x i8> [ [[TMP56]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP60]], [[PRED_LOAD_IF15]] ] +; CHECK-MAXBW-NEXT: [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9 +; CHECK-MAXBW-NEXT: br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label 
[[PRED_LOAD_CONTINUE18:%.*]] +; CHECK-MAXBW: pred.load.if17: +; CHECK-MAXBW-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP64:%.*]] = load i8, ptr [[TMP63]], align 1 +; CHECK-MAXBW-NEXT: [[TMP65:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP64]], i32 9 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE18]] +; CHECK-MAXBW: pred.load.continue18: +; CHECK-MAXBW-NEXT: [[TMP66:%.*]] = phi <16 x i8> [ [[TMP61]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP65]], [[PRED_LOAD_IF17]] ] +; CHECK-MAXBW-NEXT: [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10 +; CHECK-MAXBW-NEXT: br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]] +; CHECK-MAXBW: pred.load.if19: +; CHECK-MAXBW-NEXT: [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1 +; CHECK-MAXBW-NEXT: [[TMP70:%.*]] = insertelement <16 x i8> [[TMP66]], i8 [[TMP69]], i32 10 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE20]] +; CHECK-MAXBW: pred.load.continue20: +; CHECK-MAXBW-NEXT: [[TMP71:%.*]] = phi <16 x i8> [ [[TMP66]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP70]], [[PRED_LOAD_IF19]] ] +; CHECK-MAXBW-NEXT: [[TMP72:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11 +; CHECK-MAXBW-NEXT: br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]] +; CHECK-MAXBW: pred.load.if21: +; CHECK-MAXBW-NEXT: [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]] +; CHECK-MAXBW-NEXT: [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1 +; CHECK-MAXBW-NEXT: [[TMP75:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP74]], i32 11 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE22]] +; CHECK-MAXBW: pred.load.continue22: +; CHECK-MAXBW-NEXT: [[TMP76:%.*]] = phi <16 x i8> [ [[TMP71]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP75]], [[PRED_LOAD_IF21]] ] +; CHECK-MAXBW-NEXT: [[TMP77:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12 +; CHECK-MAXBW-NEXT: br i1 [[TMP77]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]] +; CHECK-MAXBW: pred.load.if23: +; CHECK-MAXBW-NEXT: [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] +; CHECK-MAXBW-NEXT: [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1 +; CHECK-MAXBW-NEXT: [[TMP80:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP79]], i32 12 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE24]] +; CHECK-MAXBW: pred.load.continue24: +; CHECK-MAXBW-NEXT: [[TMP81:%.*]] = phi <16 x i8> [ [[TMP76]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP80]], [[PRED_LOAD_IF23]] ] +; CHECK-MAXBW-NEXT: [[TMP82:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13 +; CHECK-MAXBW-NEXT: br i1 [[TMP82]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]] +; CHECK-MAXBW: pred.load.if25: +; CHECK-MAXBW-NEXT: [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]] +; CHECK-MAXBW-NEXT: [[TMP84:%.*]] = load i8, ptr [[TMP83]], align 1 +; CHECK-MAXBW-NEXT: [[TMP85:%.*]] = insertelement <16 x i8> [[TMP81]], i8 [[TMP84]], i32 13 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE26]] +; CHECK-MAXBW: pred.load.continue26: +; CHECK-MAXBW-NEXT: [[TMP86:%.*]] = phi <16 x i8> [ [[TMP81]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP85]], [[PRED_LOAD_IF25]] ] +; CHECK-MAXBW-NEXT: [[TMP87:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14 +; CHECK-MAXBW-NEXT: br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]] +; CHECK-MAXBW: pred.load.if27: +; CHECK-MAXBW-NEXT: [[TMP88:%.*]] = getelementptr inbounds i8, ptr 
[[B]], i64 [[TMP14]] +; CHECK-MAXBW-NEXT: [[TMP89:%.*]] = load i8, ptr [[TMP88]], align 1 +; CHECK-MAXBW-NEXT: [[TMP90:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP89]], i32 14 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE28]] +; CHECK-MAXBW: pred.load.continue28: +; CHECK-MAXBW-NEXT: [[TMP91:%.*]] = phi <16 x i8> [ [[TMP86]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP90]], [[PRED_LOAD_IF27]] ] +; CHECK-MAXBW-NEXT: [[TMP92:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15 +; CHECK-MAXBW-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]] +; CHECK-MAXBW: pred.load.if29: +; CHECK-MAXBW-NEXT: [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]] +; CHECK-MAXBW-NEXT: [[TMP94:%.*]] = load i8, ptr [[TMP93]], align 1 +; CHECK-MAXBW-NEXT: [[TMP95:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP94]], i32 15 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE30]] +; CHECK-MAXBW: pred.load.continue30: +; CHECK-MAXBW-NEXT: [[TMP96:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ] +; CHECK-MAXBW-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP96]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP98:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 +; CHECK-MAXBW-NEXT: br i1 [[TMP98]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]] +; CHECK-MAXBW: pred.load.if31: +; CHECK-MAXBW-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1 +; CHECK-MAXBW-NEXT: [[TMP101:%.*]] = insertelement <16 x i8> poison, i8 [[TMP100]], i32 0 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE32]] +; CHECK-MAXBW: pred.load.continue32: +; CHECK-MAXBW-NEXT: [[TMP102:%.*]] = phi <16 x i8> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP101]], [[PRED_LOAD_IF31]] ] +; CHECK-MAXBW-NEXT: [[TMP103:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1 +; CHECK-MAXBW-NEXT: br i1 [[TMP103]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]] +; CHECK-MAXBW: pred.load.if33: +; CHECK-MAXBW-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]] +; CHECK-MAXBW-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1 +; CHECK-MAXBW-NEXT: [[TMP106:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP105]], i32 1 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE34]] +; CHECK-MAXBW: pred.load.continue34: +; CHECK-MAXBW-NEXT: [[TMP107:%.*]] = phi <16 x i8> [ [[TMP102]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP106]], [[PRED_LOAD_IF33]] ] +; CHECK-MAXBW-NEXT: [[TMP108:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2 +; CHECK-MAXBW-NEXT: br i1 [[TMP108]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]] +; CHECK-MAXBW: pred.load.if35: +; CHECK-MAXBW-NEXT: [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]] +; CHECK-MAXBW-NEXT: [[TMP110:%.*]] = load i8, ptr [[TMP109]], align 1 +; CHECK-MAXBW-NEXT: [[TMP111:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP110]], i32 2 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE36]] +; CHECK-MAXBW: pred.load.continue36: +; CHECK-MAXBW-NEXT: [[TMP112:%.*]] = phi <16 x i8> [ [[TMP107]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP111]], [[PRED_LOAD_IF35]] ] +; CHECK-MAXBW-NEXT: [[TMP113:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3 +; CHECK-MAXBW-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]] +; CHECK-MAXBW: pred.load.if37: +; CHECK-MAXBW-NEXT: [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] +; 
CHECK-MAXBW-NEXT: [[TMP115:%.*]] = load i8, ptr [[TMP114]], align 1 +; CHECK-MAXBW-NEXT: [[TMP116:%.*]] = insertelement <16 x i8> [[TMP112]], i8 [[TMP115]], i32 3 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE38]] +; CHECK-MAXBW: pred.load.continue38: +; CHECK-MAXBW-NEXT: [[TMP117:%.*]] = phi <16 x i8> [ [[TMP112]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP116]], [[PRED_LOAD_IF37]] ] +; CHECK-MAXBW-NEXT: [[TMP118:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4 +; CHECK-MAXBW-NEXT: br i1 [[TMP118]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]] +; CHECK-MAXBW: pred.load.if39: +; CHECK-MAXBW-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]] +; CHECK-MAXBW-NEXT: [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1 +; CHECK-MAXBW-NEXT: [[TMP121:%.*]] = insertelement <16 x i8> [[TMP117]], i8 [[TMP120]], i32 4 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE40]] +; CHECK-MAXBW: pred.load.continue40: +; CHECK-MAXBW-NEXT: [[TMP122:%.*]] = phi <16 x i8> [ [[TMP117]], [[PRED_LOAD_CONTINUE38]] ], [ [[TMP121]], [[PRED_LOAD_IF39]] ] +; CHECK-MAXBW-NEXT: [[TMP123:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5 +; CHECK-MAXBW-NEXT: br i1 [[TMP123]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]] +; CHECK-MAXBW: pred.load.if41: +; CHECK-MAXBW-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1 +; CHECK-MAXBW-NEXT: [[TMP126:%.*]] = insertelement <16 x i8> [[TMP122]], i8 [[TMP125]], i32 5 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE42]] +; CHECK-MAXBW: pred.load.continue42: +; CHECK-MAXBW-NEXT: [[TMP127:%.*]] = phi <16 x i8> [ [[TMP122]], [[PRED_LOAD_CONTINUE40]] ], [ [[TMP126]], [[PRED_LOAD_IF41]] ] +; CHECK-MAXBW-NEXT: [[TMP128:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6 +; CHECK-MAXBW-NEXT: br i1 [[TMP128]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44:%.*]] +; CHECK-MAXBW: pred.load.if43: +; CHECK-MAXBW-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP130:%.*]] = load i8, ptr [[TMP129]], align 1 +; CHECK-MAXBW-NEXT: [[TMP131:%.*]] = insertelement <16 x i8> [[TMP127]], i8 [[TMP130]], i32 6 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE44]] +; CHECK-MAXBW: pred.load.continue44: +; CHECK-MAXBW-NEXT: [[TMP132:%.*]] = phi <16 x i8> [ [[TMP127]], [[PRED_LOAD_CONTINUE42]] ], [ [[TMP131]], [[PRED_LOAD_IF43]] ] +; CHECK-MAXBW-NEXT: [[TMP133:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7 +; CHECK-MAXBW-NEXT: br i1 [[TMP133]], label [[PRED_LOAD_IF45:%.*]], label [[PRED_LOAD_CONTINUE46:%.*]] +; CHECK-MAXBW: pred.load.if45: +; CHECK-MAXBW-NEXT: [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]] +; CHECK-MAXBW-NEXT: [[TMP135:%.*]] = load i8, ptr [[TMP134]], align 1 +; CHECK-MAXBW-NEXT: [[TMP136:%.*]] = insertelement <16 x i8> [[TMP132]], i8 [[TMP135]], i32 7 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE46]] +; CHECK-MAXBW: pred.load.continue46: +; CHECK-MAXBW-NEXT: [[TMP137:%.*]] = phi <16 x i8> [ [[TMP132]], [[PRED_LOAD_CONTINUE44]] ], [ [[TMP136]], [[PRED_LOAD_IF45]] ] +; CHECK-MAXBW-NEXT: [[TMP138:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8 +; CHECK-MAXBW-NEXT: br i1 [[TMP138]], label [[PRED_LOAD_IF47:%.*]], label [[PRED_LOAD_CONTINUE48:%.*]] +; CHECK-MAXBW: pred.load.if47: +; CHECK-MAXBW-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]] +; CHECK-MAXBW-NEXT: [[TMP140:%.*]] = load i8, ptr [[TMP139]], align 1 +; CHECK-MAXBW-NEXT: 
[[TMP141:%.*]] = insertelement <16 x i8> [[TMP137]], i8 [[TMP140]], i32 8 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE48]] +; CHECK-MAXBW: pred.load.continue48: +; CHECK-MAXBW-NEXT: [[TMP142:%.*]] = phi <16 x i8> [ [[TMP137]], [[PRED_LOAD_CONTINUE46]] ], [ [[TMP141]], [[PRED_LOAD_IF47]] ] +; CHECK-MAXBW-NEXT: [[TMP143:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9 +; CHECK-MAXBW-NEXT: br i1 [[TMP143]], label [[PRED_LOAD_IF49:%.*]], label [[PRED_LOAD_CONTINUE50:%.*]] +; CHECK-MAXBW: pred.load.if49: +; CHECK-MAXBW-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1 +; CHECK-MAXBW-NEXT: [[TMP146:%.*]] = insertelement <16 x i8> [[TMP142]], i8 [[TMP145]], i32 9 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE50]] +; CHECK-MAXBW: pred.load.continue50: +; CHECK-MAXBW-NEXT: [[TMP147:%.*]] = phi <16 x i8> [ [[TMP142]], [[PRED_LOAD_CONTINUE48]] ], [ [[TMP146]], [[PRED_LOAD_IF49]] ] +; CHECK-MAXBW-NEXT: [[TMP148:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10 +; CHECK-MAXBW-NEXT: br i1 [[TMP148]], label [[PRED_LOAD_IF51:%.*]], label [[PRED_LOAD_CONTINUE52:%.*]] +; CHECK-MAXBW: pred.load.if51: +; CHECK-MAXBW-NEXT: [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP150:%.*]] = load i8, ptr [[TMP149]], align 1 +; CHECK-MAXBW-NEXT: [[TMP151:%.*]] = insertelement <16 x i8> [[TMP147]], i8 [[TMP150]], i32 10 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE52]] +; CHECK-MAXBW: pred.load.continue52: +; CHECK-MAXBW-NEXT: [[TMP152:%.*]] = phi <16 x i8> [ [[TMP147]], [[PRED_LOAD_CONTINUE50]] ], [ [[TMP151]], [[PRED_LOAD_IF51]] ] +; CHECK-MAXBW-NEXT: [[TMP153:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11 +; CHECK-MAXBW-NEXT: br i1 [[TMP153]], label [[PRED_LOAD_IF53:%.*]], label [[PRED_LOAD_CONTINUE54:%.*]] +; CHECK-MAXBW: pred.load.if53: +; CHECK-MAXBW-NEXT: [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]] +; CHECK-MAXBW-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP154]], align 1 +; CHECK-MAXBW-NEXT: [[TMP156:%.*]] = insertelement <16 x i8> [[TMP152]], i8 [[TMP155]], i32 11 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE54]] +; CHECK-MAXBW: pred.load.continue54: +; CHECK-MAXBW-NEXT: [[TMP157:%.*]] = phi <16 x i8> [ [[TMP152]], [[PRED_LOAD_CONTINUE52]] ], [ [[TMP156]], [[PRED_LOAD_IF53]] ] +; CHECK-MAXBW-NEXT: [[TMP158:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12 +; CHECK-MAXBW-NEXT: br i1 [[TMP158]], label [[PRED_LOAD_IF55:%.*]], label [[PRED_LOAD_CONTINUE56:%.*]] +; CHECK-MAXBW: pred.load.if55: +; CHECK-MAXBW-NEXT: [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]] +; CHECK-MAXBW-NEXT: [[TMP160:%.*]] = load i8, ptr [[TMP159]], align 1 +; CHECK-MAXBW-NEXT: [[TMP161:%.*]] = insertelement <16 x i8> [[TMP157]], i8 [[TMP160]], i32 12 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE56]] +; CHECK-MAXBW: pred.load.continue56: +; CHECK-MAXBW-NEXT: [[TMP162:%.*]] = phi <16 x i8> [ [[TMP157]], [[PRED_LOAD_CONTINUE54]] ], [ [[TMP161]], [[PRED_LOAD_IF55]] ] +; CHECK-MAXBW-NEXT: [[TMP163:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13 +; CHECK-MAXBW-NEXT: br i1 [[TMP163]], label [[PRED_LOAD_IF57:%.*]], label [[PRED_LOAD_CONTINUE58:%.*]] +; CHECK-MAXBW: pred.load.if57: +; CHECK-MAXBW-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]] +; CHECK-MAXBW-NEXT: [[TMP165:%.*]] = load i8, ptr [[TMP164]], align 1 +; CHECK-MAXBW-NEXT: [[TMP166:%.*]] = insertelement <16 x i8> [[TMP162]], i8 [[TMP165]], i32 13 +; 
CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE58]] +; CHECK-MAXBW: pred.load.continue58: +; CHECK-MAXBW-NEXT: [[TMP167:%.*]] = phi <16 x i8> [ [[TMP162]], [[PRED_LOAD_CONTINUE56]] ], [ [[TMP166]], [[PRED_LOAD_IF57]] ] +; CHECK-MAXBW-NEXT: [[TMP168:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14 +; CHECK-MAXBW-NEXT: br i1 [[TMP168]], label [[PRED_LOAD_IF59:%.*]], label [[PRED_LOAD_CONTINUE60:%.*]] +; CHECK-MAXBW: pred.load.if59: +; CHECK-MAXBW-NEXT: [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]] +; CHECK-MAXBW-NEXT: [[TMP170:%.*]] = load i8, ptr [[TMP169]], align 1 +; CHECK-MAXBW-NEXT: [[TMP171:%.*]] = insertelement <16 x i8> [[TMP167]], i8 [[TMP170]], i32 14 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE60]] +; CHECK-MAXBW: pred.load.continue60: +; CHECK-MAXBW-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP167]], [[PRED_LOAD_CONTINUE58]] ], [ [[TMP171]], [[PRED_LOAD_IF59]] ] +; CHECK-MAXBW-NEXT: [[TMP173:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15 +; CHECK-MAXBW-NEXT: br i1 [[TMP173]], label [[PRED_LOAD_IF61:%.*]], label [[PRED_LOAD_CONTINUE62]] +; CHECK-MAXBW: pred.load.if61: +; CHECK-MAXBW-NEXT: [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]] +; CHECK-MAXBW-NEXT: [[TMP175:%.*]] = load i8, ptr [[TMP174]], align 1 +; CHECK-MAXBW-NEXT: [[TMP176:%.*]] = insertelement <16 x i8> [[TMP172]], i8 [[TMP175]], i32 15 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE62]] +; CHECK-MAXBW: pred.load.continue62: +; CHECK-MAXBW-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ] +; CHECK-MAXBW-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]] +; CHECK-MAXBW-NEXT: [[TMP180]] = add <16 x i32> [[TMP179]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[TMP181:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP180]], <16 x i32> [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16) +; CHECK-MAXBW-NEXT: [[TMP182:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP182]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP183:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP181]]) +; CHECK-MAXBW-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP183]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_A2]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32 +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]] +; 
CHECK-MAXBW-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-MAXBW: exit: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP183]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]] ; entry: br label %for.body @@ -1186,7 +2672,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_extend_user( ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: @@ -1204,14 +2690,14 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] ; CHECK-INTERLEAVE1-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 ; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVE1: scalar.ph: -; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-INTERLEAVE1: for.body: @@ -1226,7 +2712,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] ; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] ; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-INTERLEAVE1: for.exit: ; CHECK-INTERLEAVE1-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] @@ -1237,7 +2723,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: 
vector.body: @@ -1264,7 +2750,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI]] ; CHECK-INTERLEAVED-NEXT: [[TMP14]] = add <16 x i32> [[TMP12]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP14]], [[TMP13]] @@ -1272,7 +2758,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = extractelement <16 x i32> [[TMP10]], i32 15 ; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVED: scalar.ph: -; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-INTERLEAVED: for.body: @@ -1287,7 +2773,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] ; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] ; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-INTERLEAVED: for.exit: ; CHECK-INTERLEAVED-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] @@ -1298,7 +2784,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) { ; CHECK-MAXBW-LABEL: define i32 @not_dotp_extend_user( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: @@ -1316,14 +2802,14 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[VEC_PHI]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]]) ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 ; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: -; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, 
[[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-MAXBW: for.body: @@ -1338,7 +2824,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] ; CHECK-MAXBW-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] ; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-MAXBW: for.exit: ; CHECK-MAXBW-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] @@ -1361,7 +2847,7 @@ for.body: ; preds = %for.body, %entry %mul = mul i32 %ext.b, %ext.a %add = add i32 %mul, %accum %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 + %exitcond.not = icmp eq i64 %iv.next, 1024 br i1 %exitcond.not, label %for.exit, label %for.body for.exit: ; preds = %for.body @@ -1373,3 +2859,64 @@ for.exit: ; preds = %for.body !8 = !{!"llvm.loop.mustprogress"} !9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} !10 = !{!"llvm.loop.vectorize.enable", i1 true} +;. +; CHECK-INTERLEAVE1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-INTERLEAVE1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-INTERLEAVE1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-INTERLEAVE1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP14]] = distinct !{[[LOOP14]], [[META15:![0-9]+]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[META15]] = !{!"llvm.loop.mustprogress"} +; CHECK-INTERLEAVE1: [[LOOP16]] = distinct !{[[LOOP16]], [[META15]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]} +;. 
+; CHECK-INTERLEAVED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-INTERLEAVED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-INTERLEAVED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-INTERLEAVED: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP14]] = distinct !{[[LOOP14]], [[META15:![0-9]+]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[META15]] = !{!"llvm.loop.mustprogress"} +; CHECK-INTERLEAVED: [[LOOP16]] = distinct !{[[LOOP16]], [[META15]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]} +;. +; CHECK-MAXBW: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-MAXBW: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-MAXBW: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-MAXBW: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP14]] = distinct !{[[LOOP14]], [[META15:![0-9]+]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[META15]] = !{!"llvm.loop.mustprogress"} +; CHECK-MAXBW: [[LOOP16]] = distinct !{[[LOOP16]], [[META15]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]} +;. 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index af2a7b966f700..a0214ae88c2d6 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -12,12 +12,12 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] ; CHECK-INTERLEAVE1: vector.ph: ; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY1:%.*]] @@ -40,7 +40,7 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP14]]) -; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]] ; CHECK-INTERLEAVE1: scalar.ph: ; CHECK-INTERLEAVE1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH:%.*]] ] @@ -58,7 +58,7 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] ; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-INTERLEAVE1: for.exit: ; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ], [ [[TMP27]], [[VEC_EPILOG_ITER_CHECK]] ] @@ -69,12 +69,12 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] ; CHECK-INTERLEAVED: vector.ph: ; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem 
i64 1024, [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY1:%.*]] @@ -111,7 +111,7 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add [[TMP24]], [[TMP23]] ; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX]]) -; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]] ; CHECK-INTERLEAVED: scalar.ph: ; CHECK-INTERLEAVED-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH:%.*]] ] @@ -129,7 +129,7 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] ; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-INTERLEAVED: for.exit: ; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ], [ [[TMP16]], [[VEC_EPILOG_ITER_CHECK]] ] @@ -140,12 +140,12 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: entry: ; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 -; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] -; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] @@ -166,6 +166,31 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[PARTIAL_REDUCE5]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ 
[[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]] ; entry: br label %for.body @@ -182,7 +207,7 @@ for.body: ; preds = %for.body, %entry %mul = mul i32 %ext.b, %ext.a %add = add i32 %mul, %accum %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 + %exitcond.not = icmp eq i64 %iv.next, 1024 br i1 %exitcond.not, label %for.exit, label %for.body for.exit: ; preds = %for.body @@ -193,7 +218,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_different_types( ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: @@ -271,13 +296,13 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP68:%.*]] = mul <16 x i32> [[TMP67]], [[TMP18]] ; CHECK-INTERLEAVE1-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]]) ; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVE1: scalar.ph: -; CHECK-INTERLEAVE1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ] ; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP71]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-INTERLEAVE1: for.body: @@ -292,7 +317,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] ; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] ; 
CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-INTERLEAVE1: for.exit: ; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP71]], [[VEC_EPILOG_ITER_CHECK]] ] @@ -301,7 +326,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_different_types( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: @@ -450,17 +475,38 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP137]] = add <16 x i32> [[TMP135]], [[VEC_PHI]] ; CHECK-INTERLEAVED-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-INTERLEAVED-NEXT: [[TMP141:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP141:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP141]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP138]], [[TMP137]] ; CHECK-INTERLEAVED-NEXT: [[TMP142:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) ; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP142]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i16, ptr [[GEP_B]], align 2 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i16 [[LOAD_B]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP142]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]] ; ; 
CHECK-MAXBW-LABEL: define i32 @not_dotp_different_types( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: @@ -538,8 +584,32 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP136:%.*]] = mul <16 x i32> [[TMP134]], [[TMP36]] ; CHECK-MAXBW-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-MAXBW-NEXT: [[TMP139:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-MAXBW-NEXT: br i1 [[TMP139]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-MAXBW-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP138]]) +; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP71]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i16, ptr [[GEP_B]], align 2 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i16 [[LOAD_B]] to i32 +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP71]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]] ; entry: br label %for.body @@ -556,7 +626,7 @@ for.body: ; preds = %for.body, %entry %mul = mul i32 %ext.b, %ext.a %add = add i32 %mul, %accum %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 + %exitcond.not = icmp eq i64 %iv.next, 1024 br i1 %exitcond.not, label %for.exit, label %for.body for.exit: ; preds = %for.body @@ -569,12 +639,12 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; 
CHECK-INTERLEAVE1: vector.ph: ; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() @@ -600,18 +670,50 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = sub i32 [[TMP21]], 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = extractelement [[TMP18]], i32 [[TMP22]] +; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = mul i32 [[TMP24]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = sub i32 [[TMP25]], 1 +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[TMP16]], i32 [[TMP26]] +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_loop_carried( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; 
CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: ; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() @@ -648,18 +750,50 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul i32 [[TMP29]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 1 +; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = extractelement [[TMP27]], i32 [[TMP31]] +; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = mul i32 [[TMP33]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = sub i32 [[TMP34]], 1 +; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[TMP25]], i32 [[TMP35]] +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop 
[[LOOP7:![0-9]+]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP32]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_loop_carried( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: ; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 -; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] -; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 ; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() @@ -685,6 +819,38 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], 8 +; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = sub i32 [[TMP21]], 1 +; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = extractelement [[TMP27]], i32 [[TMP22]] +; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = mul i32 [[TMP24]], 8 +; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 1 +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[TMP25]], i32 [[TMP31]] +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-MAXBW-NEXT: [[MUL]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label 
[[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]] ; entry: br label %for.body @@ -701,7 +867,7 @@ for.body: ; preds = %for.body, %entry %mul = mul i32 %ext.b, %ext.a %add = add i32 %mul, %accum %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 + %exitcond.not = icmp eq i64 %iv.next, 1024 br i1 %exitcond.not, label %for.exit, label %for.body for.exit: ; preds = %for.body @@ -714,12 +880,12 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: ; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() @@ -744,18 +910,50 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = mul i32 [[TMP19]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = sub i32 [[TMP20]], 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = extractelement [[TMP17]], i32 [[TMP21]] +; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = mul i32 [[TMP23]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sub i32 [[TMP24]], 1 +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[TMP17]], i32 [[TMP25]] +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] 
= zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[EXT_B]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_phi( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: ; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() @@ -784,18 +982,50 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul i32 [[TMP23]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sub i32 [[TMP31]], 1 +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = extractelement [[TMP21]], i32 [[TMP25]] +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = sub i32 [[TMP28]], 1 +; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[TMP21]], i32 [[TMP29]] +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] 
+; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[EXT_B]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_phi( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: ; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 -; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] -; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 ; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() @@ -820,6 +1050,38 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], 8 +; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = sub i32 [[TMP27]], 1 +; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = extractelement [[TMP21]], i32 [[TMP28]] +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = mul i32 [[TMP23]], 8 +; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = sub i32 [[TMP24]], 1 +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[TMP21]], i32 [[TMP25]] +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; 
CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = add i32 [[MUL]], [[EXT_B]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]] ; entry: br label %for.body @@ -836,7 +1098,7 @@ for.body: ; preds = %for.body, %entry %mul = mul i32 %ext.b, %ext.a %add = add i32 %mul, %ext.b %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 + %exitcond.not = icmp eq i64 %iv.next, 1024 br i1 %exitcond.not, label %for.exit, label %for.body for.exit: ; preds = %for.body @@ -912,6 +1174,73 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP26]] ; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP41]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP35]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP30]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP46:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP23]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP43]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX11:%.*]] = phi i32 [ [[TMP44]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX12:%.*]] = phi i32 [ [[TMP45]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX13:%.*]] = phi i32 [ [[TMP46]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM3:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD_A3:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM2:%.*]] = phi i32 [ [[BC_MERGE_RDX11]], [[SCALAR_PH]] ], [ [[ADD_A2:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: 
[[ACCUM1:%.*]] = phi i32 [ [[BC_MERGE_RDX12]], [[SCALAR_PH]] ], [ [[ADD_A1:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM0:%.*]] = phi i32 [ [[BC_MERGE_RDX13]], [[SCALAR_PH]] ], [ [[ADD_A0:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[GEP_B0:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[OFFSET_1:%.*]] = or disjoint i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[GEP_A1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_1]] +; CHECK-INTERLEAVE1-NEXT: [[GEP_B1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_1]] +; CHECK-INTERLEAVE1-NEXT: [[OFFSET_2:%.*]] = or disjoint i64 [[IV]], 2 +; CHECK-INTERLEAVE1-NEXT: [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_2]] +; CHECK-INTERLEAVE1-NEXT: [[GEP_B2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_2]] +; CHECK-INTERLEAVE1-NEXT: [[OFFSET_3:%.*]] = or disjoint i64 [[IV]], 3 +; CHECK-INTERLEAVE1-NEXT: [[GEP_A3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_3]] +; CHECK-INTERLEAVE1-NEXT: [[GEP_B3:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_3]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A0:%.*]] = load i8, ptr [[GEP_A0]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A0:%.*]] = sext i8 [[LOAD_A0]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B0:%.*]] = load i8, ptr [[GEP_B0]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B0:%.*]] = sext i8 [[LOAD_B0]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL_A0:%.*]] = mul nsw i32 [[EXT_B0]], [[EXT_A0]] +; CHECK-INTERLEAVE1-NEXT: [[ADD_A0]] = add nsw i32 [[MUL_A0]], [[ACCUM0]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A1:%.*]] = load i8, ptr [[GEP_A1]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A1:%.*]] = sext i8 [[LOAD_A1]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B1:%.*]] = load i8, ptr [[GEP_B1]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B1:%.*]] = sext i8 [[LOAD_B1]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL_A1:%.*]] = mul nsw i32 [[EXT_A1]], [[EXT_B1]] +; CHECK-INTERLEAVE1-NEXT: [[ADD_A1]] = add nsw i32 [[MUL_A1]], [[ACCUM1]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A2:%.*]] = load i8, ptr [[GEP_A2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A2:%.*]] = sext i8 [[LOAD_A2]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B2:%.*]] = load i8, ptr [[GEP_B2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B2:%.*]] = sext i8 [[LOAD_B2]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL_A2:%.*]] = mul nsw i32 [[EXT_A2]], [[EXT_B2]] +; CHECK-INTERLEAVE1-NEXT: [[ADD_A2]] = add nsw i32 [[MUL_A2]], [[ACCUM2]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A3:%.*]] = load i8, ptr [[GEP_A3]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A3:%.*]] = sext i8 [[LOAD_A3]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B3:%.*]] = load i8, ptr [[GEP_B3]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B3:%.*]] = sext i8 [[LOAD_B3]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL_A3:%.*]] = mul nsw i32 [[EXT_A3]], [[EXT_B3]] +; CHECK-INTERLEAVE1-NEXT: [[ADD_A3]] = add nsw i32 [[MUL_A3]], [[ACCUM3]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[NUM_IN]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-INTERLEAVE1: exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_A0_LCSSA:%.*]] = phi i32 [ [[ADD_A0]], [[FOR_BODY]] ], [ [[TMP46]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: [[ADD_A1_LCSSA:%.*]] = phi i32 [ [[ADD_A1]], [[FOR_BODY]] ], [ [[TMP45]], 
+; CHECK-INTERLEAVE1-NEXT: [[ADD_A2_LCSSA:%.*]] = phi i32 [ [[ADD_A2]], [[FOR_BODY]] ], [ [[TMP44]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVE1-NEXT: [[ADD_A3_LCSSA:%.*]] = phi i32 [ [[ADD_A3]], [[FOR_BODY]] ], [ [[TMP43]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVE1-NEXT: [[RESULT0:%.*]] = add nsw i32 [[ADD_A0_LCSSA]], [[ADD_A1_LCSSA]]
+; CHECK-INTERLEAVE1-NEXT: [[RESULT1:%.*]] = add nsw i32 [[ADD_A2_LCSSA]], [[ADD_A3_LCSSA]]
+; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = add nsw i32 [[RESULT0]], [[RESULT1]]
+; CHECK-INTERLEAVE1-NEXT: ret i32 [[RESULT]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled(
 ; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
@@ -1033,6 +1362,77 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP40]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP81]], [[TMP80]]
+; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX23:%.*]] = add <vscale x 4 x i32> [[TMP65]], [[TMP64]]
+; CHECK-INTERLEAVED-NEXT: [[TMP84:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX23]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX24:%.*]] = add <vscale x 4 x i32> [[TMP49]], [[TMP48]]
+; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX24]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX25:%.*]] = add <vscale x 4 x i32> [[TMP33]], [[TMP50]]
+; CHECK-INTERLEAVED-NEXT: [[TMP86:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX25]])
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED: scalar.ph:
+; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP83]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX26:%.*]] = phi i32 [ [[TMP84]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX27:%.*]] = phi i32 [ [[TMP85]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX28:%.*]] = phi i32 [ [[TMP86]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVED: for.body:
+; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[ACCUM3:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD_A3:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[ACCUM2:%.*]] = phi i32 [ [[BC_MERGE_RDX26]], [[SCALAR_PH]] ], [ [[ADD_A2:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[ACCUM1:%.*]] = phi i32 [ [[BC_MERGE_RDX27]], [[SCALAR_PH]] ], [ [[ADD_A1:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[ACCUM0:%.*]] = phi i32 [ [[BC_MERGE_RDX28]], [[SCALAR_PH]] ], [ [[ADD_A0:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[GEP_A0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
+; CHECK-INTERLEAVED-NEXT: [[GEP_B0:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
+; CHECK-INTERLEAVED-NEXT: [[OFFSET_1:%.*]] = or disjoint i64 [[IV]], 1
+; CHECK-INTERLEAVED-NEXT: [[GEP_A1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_1]]
+; CHECK-INTERLEAVED-NEXT: [[GEP_B1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_1]]
+; CHECK-INTERLEAVED-NEXT: [[OFFSET_2:%.*]] = or disjoint i64 [[IV]], 2
+; CHECK-INTERLEAVED-NEXT: [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_2]]
+; CHECK-INTERLEAVED-NEXT: [[GEP_B2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_2]]
+; CHECK-INTERLEAVED-NEXT: [[OFFSET_3:%.*]] = or disjoint i64 [[IV]], 3
+; CHECK-INTERLEAVED-NEXT: [[GEP_A3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_3]]
+; CHECK-INTERLEAVED-NEXT: [[GEP_B3:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_3]]
+; CHECK-INTERLEAVED-NEXT: [[LOAD_A0:%.*]] = load i8, ptr [[GEP_A0]], align 1
+; CHECK-INTERLEAVED-NEXT: [[EXT_A0:%.*]] = sext i8 [[LOAD_A0]] to i32
+; CHECK-INTERLEAVED-NEXT: [[LOAD_B0:%.*]] = load i8, ptr [[GEP_B0]], align 1
+; CHECK-INTERLEAVED-NEXT: [[EXT_B0:%.*]] = sext i8 [[LOAD_B0]] to i32
+; CHECK-INTERLEAVED-NEXT: [[MUL_A0:%.*]] = mul nsw i32 [[EXT_B0]], [[EXT_A0]]
+; CHECK-INTERLEAVED-NEXT: [[ADD_A0]] = add nsw i32 [[MUL_A0]], [[ACCUM0]]
+; CHECK-INTERLEAVED-NEXT: [[LOAD_A1:%.*]] = load i8, ptr [[GEP_A1]], align 1
+; CHECK-INTERLEAVED-NEXT: [[EXT_A1:%.*]] = sext i8 [[LOAD_A1]] to i32
+; CHECK-INTERLEAVED-NEXT: [[LOAD_B1:%.*]] = load i8, ptr [[GEP_B1]], align 1
+; CHECK-INTERLEAVED-NEXT: [[EXT_B1:%.*]] = sext i8 [[LOAD_B1]] to i32
+; CHECK-INTERLEAVED-NEXT: [[MUL_A1:%.*]] = mul nsw i32 [[EXT_A1]], [[EXT_B1]]
+; CHECK-INTERLEAVED-NEXT: [[ADD_A1]] = add nsw i32 [[MUL_A1]], [[ACCUM1]]
+; CHECK-INTERLEAVED-NEXT: [[LOAD_A2:%.*]] = load i8, ptr [[GEP_A2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[EXT_A2:%.*]] = sext i8 [[LOAD_A2]] to i32
+; CHECK-INTERLEAVED-NEXT: [[LOAD_B2:%.*]] = load i8, ptr [[GEP_B2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[EXT_B2:%.*]] = sext i8 [[LOAD_B2]] to i32
+; CHECK-INTERLEAVED-NEXT: [[MUL_A2:%.*]] = mul nsw i32 [[EXT_A2]], [[EXT_B2]]
+; CHECK-INTERLEAVED-NEXT: [[ADD_A2]] = add nsw i32 [[MUL_A2]], [[ACCUM2]]
+; CHECK-INTERLEAVED-NEXT: [[LOAD_A3:%.*]] = load i8, ptr [[GEP_A3]], align 1
+; CHECK-INTERLEAVED-NEXT: [[EXT_A3:%.*]] = sext i8 [[LOAD_A3]] to i32
+; CHECK-INTERLEAVED-NEXT: [[LOAD_B3:%.*]] = load i8, ptr [[GEP_B3]], align 1
+; CHECK-INTERLEAVED-NEXT: [[EXT_B3:%.*]] = sext i8 [[LOAD_B3]] to i32
+; CHECK-INTERLEAVED-NEXT: [[MUL_A3:%.*]] = mul nsw i32 [[EXT_A3]], [[EXT_B3]]
+; CHECK-INTERLEAVED-NEXT: [[ADD_A3]] = add nsw i32 [[MUL_A3]], [[ACCUM3]]
+; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[NUM_IN]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-INTERLEAVED: exit:
+; CHECK-INTERLEAVED-NEXT: [[ADD_A0_LCSSA:%.*]] = phi i32 [ [[ADD_A0]], [[FOR_BODY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVED-NEXT: [[ADD_A1_LCSSA:%.*]] = phi i32 [ [[ADD_A1]], [[FOR_BODY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVED-NEXT: [[ADD_A2_LCSSA:%.*]] = phi i32 [ [[ADD_A2]], [[FOR_BODY]] ], [ [[TMP84]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVED-NEXT: [[ADD_A3_LCSSA:%.*]] = phi i32 [ [[ADD_A3]], [[FOR_BODY]] ], [ [[TMP83]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVED-NEXT: [[RESULT0:%.*]] = add nsw i32 [[ADD_A0_LCSSA]], [[ADD_A1_LCSSA]]
+; CHECK-INTERLEAVED-NEXT: [[RESULT1:%.*]] = add nsw i32 [[ADD_A2_LCSSA]], [[ADD_A3_LCSSA]]
+; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = add nsw i32 [[RESULT0]], [[RESULT1]]
+; CHECK-INTERLEAVED-NEXT: ret i32 [[RESULT]]
 ;
 ; CHECK-MAXBW-LABEL: define i32 @dotp_unrolled(
 ; CHECK-MAXBW-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
@@ -1102,6 +1502,73 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-MAXBW-NEXT: [[TMP74:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-MAXBW: middle.block:
+; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE16]])
+; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE17]])
+; CHECK-MAXBW-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]])
+; CHECK-MAXBW-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE11]])
+; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-MAXBW: scalar.ph:
+; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP39]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX14:%.*]] = phi i32 [ [[TMP40]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX15:%.*]] = phi i32 [ [[TMP41]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i32 [ [[TMP42]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-MAXBW: for.body:
+; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[ACCUM3:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD_A3:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[ACCUM2:%.*]] = phi i32 [ [[BC_MERGE_RDX14]], [[SCALAR_PH]] ], [ [[ADD_A2:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[ACCUM1:%.*]] = phi i32 [ [[BC_MERGE_RDX15]], [[SCALAR_PH]] ], [ [[ADD_A1:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[ACCUM0:%.*]] = phi i32 [ [[BC_MERGE_RDX16]], [[SCALAR_PH]] ], [ [[ADD_A0:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[GEP_A0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
+; CHECK-MAXBW-NEXT: [[GEP_B0:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
+; CHECK-MAXBW-NEXT: [[OFFSET_1:%.*]] = or disjoint i64 [[IV]], 1
+; CHECK-MAXBW-NEXT: [[GEP_A1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_1]]
+; CHECK-MAXBW-NEXT: [[GEP_B1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_1]]
+; CHECK-MAXBW-NEXT: [[OFFSET_2:%.*]] = or disjoint i64 [[IV]], 2
+; CHECK-MAXBW-NEXT: [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_2]]
+; CHECK-MAXBW-NEXT: [[GEP_B2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_2]]
+; CHECK-MAXBW-NEXT: [[OFFSET_3:%.*]] = or disjoint i64 [[IV]], 3
+; CHECK-MAXBW-NEXT: [[GEP_A3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_3]]
+; CHECK-MAXBW-NEXT: [[GEP_B3:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_3]]
+; CHECK-MAXBW-NEXT: [[LOAD_A0:%.*]] = load i8, ptr [[GEP_A0]], align 1
+; CHECK-MAXBW-NEXT: [[EXT_A0:%.*]] = sext i8 [[LOAD_A0]] to i32
+; CHECK-MAXBW-NEXT: [[LOAD_B0:%.*]] = load i8, ptr [[GEP_B0]], align 1
+; CHECK-MAXBW-NEXT: [[EXT_B0:%.*]] = sext i8 [[LOAD_B0]] to i32
+; CHECK-MAXBW-NEXT: [[MUL_A0:%.*]] = mul nsw i32 [[EXT_B0]], [[EXT_A0]]
+; CHECK-MAXBW-NEXT: [[ADD_A0]] = add nsw i32 [[MUL_A0]], [[ACCUM0]]
+; CHECK-MAXBW-NEXT: [[LOAD_A1:%.*]] = load i8, ptr [[GEP_A1]], align 1
+; CHECK-MAXBW-NEXT: [[EXT_A1:%.*]] = sext i8 [[LOAD_A1]] to i32
+; CHECK-MAXBW-NEXT: [[LOAD_B1:%.*]] = load i8, ptr [[GEP_B1]], align 1
+; CHECK-MAXBW-NEXT: [[EXT_B1:%.*]] = sext i8 [[LOAD_B1]] to i32
+; CHECK-MAXBW-NEXT: [[MUL_A1:%.*]] = mul nsw i32 [[EXT_A1]], [[EXT_B1]]
+; CHECK-MAXBW-NEXT: [[ADD_A1]] = add nsw i32 [[MUL_A1]], [[ACCUM1]]
+; CHECK-MAXBW-NEXT: [[LOAD_A2:%.*]] = load i8, ptr [[GEP_A2]], align 1
+; CHECK-MAXBW-NEXT: [[EXT_A2:%.*]] = sext i8 [[LOAD_A2]] to i32
+; CHECK-MAXBW-NEXT: [[LOAD_B2:%.*]] = load i8, ptr [[GEP_B2]], align 1
+; CHECK-MAXBW-NEXT: [[EXT_B2:%.*]] = sext i8 [[LOAD_B2]] to i32
+; CHECK-MAXBW-NEXT: [[MUL_A2:%.*]] = mul nsw i32 [[EXT_A2]], [[EXT_B2]]
+; CHECK-MAXBW-NEXT: [[ADD_A2]] = add nsw i32 [[MUL_A2]], [[ACCUM2]]
+; CHECK-MAXBW-NEXT: [[LOAD_A3:%.*]] = load i8, ptr [[GEP_A3]], align 1
+; CHECK-MAXBW-NEXT: [[EXT_A3:%.*]] = sext i8 [[LOAD_A3]] to i32
+; CHECK-MAXBW-NEXT: [[LOAD_B3:%.*]] = load i8, ptr [[GEP_B3]], align 1
+; CHECK-MAXBW-NEXT: [[EXT_B3:%.*]] = sext i8 [[LOAD_B3]] to i32
+; CHECK-MAXBW-NEXT: [[MUL_A3:%.*]] = mul nsw i32 [[EXT_A3]], [[EXT_B3]]
+; CHECK-MAXBW-NEXT: [[ADD_A3]] = add nsw i32 [[MUL_A3]], [[ACCUM3]]
+; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[NUM_IN]]
+; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-MAXBW: exit:
+; CHECK-MAXBW-NEXT: [[ADD_A0_LCSSA:%.*]] = phi i32 [ [[ADD_A0]], [[FOR_BODY]] ], [ [[TMP42]], [[MIDDLE_BLOCK]] ]
+; CHECK-MAXBW-NEXT: [[ADD_A1_LCSSA:%.*]] = phi i32 [ [[ADD_A1]], [[FOR_BODY]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ]
+; CHECK-MAXBW-NEXT: [[ADD_A2_LCSSA:%.*]] = phi i32 [ [[ADD_A2]], [[FOR_BODY]] ], [ [[TMP40]], [[MIDDLE_BLOCK]] ]
+; CHECK-MAXBW-NEXT: [[ADD_A3_LCSSA:%.*]] = phi i32 [ [[ADD_A3]], [[FOR_BODY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ]
+; CHECK-MAXBW-NEXT: [[RESULT0:%.*]] = add nsw i32 [[ADD_A0_LCSSA]], [[ADD_A1_LCSSA]]
+; CHECK-MAXBW-NEXT: [[RESULT1:%.*]] = add nsw i32 [[ADD_A2_LCSSA]], [[ADD_A3_LCSSA]]
+; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = add nsw i32 [[RESULT0]], [[RESULT1]]
+; CHECK-MAXBW-NEXT: ret i32 [[RESULT]]
 ;
 entry:
 br label %for.body
@@ -1195,6 +1662,27 @@ define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]])
 ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_PH]]
+; CHECK-INTERLEAVE1: scalar.ph:
+; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: for.body:
+; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
+; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
+; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]]
+; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]]
+; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-INTERLEAVE1: exit:
+; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated(
 ; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
@@ -1246,6 +1734,27 @@ define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
 ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_PH]]
+; CHECK-INTERLEAVED: scalar.ph:
+; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP26]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVED: for.body:
+; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
+; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32
+; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
+; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32
+; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]]
+; CHECK-INTERLEAVED-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]]
+; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-INTERLEAVED: exit:
+; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]]
 ;
 ; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated(
 ; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
@@ -1279,6 +1788,31 @@ define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-MAXBW-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-MAXBW: middle.block:
+; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE5]])
+; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-MAXBW: scalar.ph:
+; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-MAXBW: for.body:
+; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
+; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32
+; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
+; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32
+; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]]
+; CHECK-MAXBW-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]]
+; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-MAXBW: exit:
+; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
+; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]]
 ;
 entry:
 br label %for.body
@@ -1344,6 +1878,30 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
 ; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[TMP20]], i32 0
 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-INTERLEAVE1: middle.block:
+; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1: scalar.ph:
+; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: for.body:
+; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
+; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
+; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_A2]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]]
+; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]]
+; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-INTERLEAVE1: exit:
+; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated_pragma(
 ; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
@@ -1386,6 +1944,30 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
 ; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[TMP20]], i32 0
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED: scalar.ph:
+; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVED: for.body:
+; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
+; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32
+; CHECK-INTERLEAVED-NEXT: [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
+; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_A2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32
+; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]]
+; CHECK-INTERLEAVED-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]]
+; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-INTERLEAVED: exit:
+; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]]
 ;
 ; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated_pragma(
 ; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
@@ -1428,6 +2010,30 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
 ; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[TMP20]], i32 0
 ; CHECK-MAXBW-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-MAXBW: middle.block:
+; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
+; CHECK-MAXBW-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-MAXBW: scalar.ph:
+; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-MAXBW: for.body:
+; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
+; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32
+; CHECK-MAXBW-NEXT: [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
+; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_A2]], align 1
+; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32
+; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]]
+; CHECK-MAXBW-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]]
+; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-MAXBW: exit:
+; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]]
 ;
 entry:
 br label %for.body
@@ -1457,12 +2063,12 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_extend_user(
 ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVE1-NEXT: entry:
 ; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1: vector.ph:
 ; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul i64 [[TMP8]], 4
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP10]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP11]], 4
 ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -1483,18 +2089,49 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-INTERLEAVE1: middle.block:
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], 4
+; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = sub i32 [[TMP18]], 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i32> [[TMP12]], i32 [[TMP19]]
+; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1: scalar.ph:
+; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: for.body:
+; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
+; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
+; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
+; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
+; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-INTERLEAVE1: for.exit:
+; CHECK-INTERLEAVE1-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]]
+; CHECK-INTERLEAVE1-NEXT: ret i32 [[RESULT]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user(
 ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVED-NEXT: entry:
 ; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
-; CHECK-INTERLEAVED-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVED: vector.ph:
 ; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP7]]
-; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP7]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP14]], 8
 ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -1528,18 +2165,50 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP24]], [[TMP23]]
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = sub i32 [[TMP28]], 1
+; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = extractelement <vscale x 4 x i32> [[TMP20]], i32 [[TMP29]]
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED: scalar.ph:
+; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP26]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVED: for.body:
+; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
+; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
+; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
+; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
+; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
+; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
+; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-INTERLEAVED: for.exit:
+; CHECK-INTERLEAVED-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]]
+; CHECK-INTERLEAVED-NEXT: ret i32 [[RESULT]]
 ;
 ; CHECK-MAXBW-LABEL: define i32 @not_dotp_extend_user(
 ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-MAXBW-NEXT: entry:
 ; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
-; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-MAXBW: vector.ph:
 ; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
-; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
-; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
 ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -1560,6 +2229,37 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-MAXBW: middle.block:
+; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP24]])
+; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], 8
+; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = sub i32 [[TMP18]], 1
+; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = extractelement <vscale x 8 x i32> [[TMP20]], i32 [[TMP19]]
+; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-MAXBW: scalar.ph:
+; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-MAXBW: for.body:
+; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
+; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
+; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
+; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
+; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
+; CHECK-MAXBW-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
+; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-MAXBW: for.exit:
+; CHECK-MAXBW-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
+; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
+; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]]
+; CHECK-MAXBW-NEXT: ret i32 [[RESULT]]
 ;
 entry:
 br label %for.body
@@ -1576,7 +2276,7 @@ for.body: ; preds = %for.body, %entry
 %mul = mul i32 %ext.b, %ext.a
 %add = add i32 %mul, %accum
 %iv.next = add i64 %iv, 1
- %exitcond.not = icmp eq i64 %iv.next, 0
+ %exitcond.not = icmp eq i64 %iv.next, 1024
 br i1 %exitcond.not, label %for.exit, label %for.body

 for.exit: ; preds = %for.body
@@ -1618,6 +2318,31 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-INTERLEAVE1: middle.block:
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP15]])
+; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1: scalar.ph:
+; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: for.body:
+; CHECK-INTERLEAVE1-NEXT: [[I_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[I_IV]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[CONV:%.*]] = zext i8 [[TMP18]] to i64
+; CHECK-INTERLEAVE1-NEXT: [[I_IV_NEXT]] = add nuw nsw i64 [[I_IV]], 1
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[I_IV_NEXT]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[CONV3:%.*]] = zext i8 [[TMP19]] to i64
+; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[CONV3]], [[CONV]]
+; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i64 [[SUM]], [[MUL]]
+; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[I_IV_NEXT]], 16
+; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-INTERLEAVE1: exit:
+; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVE1-NEXT: ret i64 [[ADD_LCSSA]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i64 @dotp_cost_disagreement(
 ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
@@ -1665,6 +2390,32 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP25]], [[TMP24]]
+; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED: scalar.ph:
+; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP27]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVED: for.body:
+; CHECK-INTERLEAVED-NEXT: [[I_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[I_IV]]
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-INTERLEAVED-NEXT: [[CONV:%.*]] = zext i8 [[TMP28]] to i64
+; CHECK-INTERLEAVED-NEXT: [[I_IV_NEXT]] = add nuw nsw i64 [[I_IV]], 1
+; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[I_IV_NEXT]]
+; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[CONV3:%.*]] = zext i8 [[TMP29]] to i64
+; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[CONV3]], [[CONV]]
+; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i64 [[SUM]], [[MUL]]
+; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[I_IV_NEXT]], 16
+; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-INTERLEAVED: exit:
+; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVED-NEXT: ret i64 [[ADD_LCSSA]]
 ;
 ; CHECK-MAXBW-LABEL: define i64 @dotp_cost_disagreement(
 ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
@@ -1703,6 +2454,27 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64> [[PARTIAL_REDUCE]])
 ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]]
 ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-MAXBW: scalar.ph:
+; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-MAXBW: for.body:
+; CHECK-MAXBW-NEXT: [[I_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[I_IV]]
+; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-MAXBW-NEXT: [[CONV:%.*]] = zext i8 [[TMP17]] to i64
+; CHECK-MAXBW-NEXT: [[I_IV_NEXT]] = add nuw nsw i64 [[I_IV]], 1
+; CHECK-MAXBW-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[I_IV_NEXT]]
+; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-MAXBW-NEXT: [[CONV3:%.*]] = zext i8 [[TMP18]] to i64
+; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[CONV3]], [[CONV]]
+; CHECK-MAXBW-NEXT: [[ADD]] = add i64 [[SUM]], [[MUL]]
+; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[I_IV_NEXT]], 16
+; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-MAXBW: exit:
+; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
+; CHECK-MAXBW-NEXT: ret i64 [[ADD_LCSSA]]
 ;
 entry:
 br label %for.body
@@ -1756,6 +2528,14 @@ define void @not_dotp_not_phi2(ptr %matrix, i32 %n) #0 {
 ; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
 ; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
 ; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]]
+; CHECK-INTERLEAVE1: for.exit:
+; CHECK-INTERLEAVE1-NEXT: [[ADD_1_LCSSA:%.*]] = phi i32 [ [[ADD_1]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[ADD_FLOAT:%.*]] = sitofp i32 [[ADD_1_LCSSA]] to float
+; CHECK-INTERLEAVE1-NEXT: br label [[EXIT]]
+; CHECK-INTERLEAVE1: exit:
+; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_FLOAT]], [[FOR_EXIT]] ]
+; CHECK-INTERLEAVE1-NEXT: store float [[RESULT]], ptr [[MATRIX]], align 4
+; CHECK-INTERLEAVE1-NEXT: ret void
 ;
 ; CHECK-INTERLEAVED-LABEL: define void @not_dotp_not_phi2(
 ; CHECK-INTERLEAVED-SAME: ptr [[MATRIX:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
@@ -1809,6 +2589,41 @@ define void @not_dotp_not_phi2(ptr %matrix, i32 %n) #0 {
 ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP23]], [[TMP22]]
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED: scalar.ph:
+; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
+; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL3:%.*]] = phi ptr [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[MATRIX]], [[FOR_PREHEADER]] ]
+; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
+; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVED: for.body:
+; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-INTERLEAVED-NEXT: [[PTR:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
+; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[ADD_1:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[PTR]], i64 1
+; CHECK-INTERLEAVED-NEXT: [[GEP_B1:%.*]] = getelementptr i8, ptr [[PTR]], i64 2
+; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-INTERLEAVED-NEXT: [[B_EXT:%.*]] = sext i8 [[LOAD_B]] to i32
+; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]]
+; CHECK-INTERLEAVED-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-INTERLEAVED-NEXT: [[LOAD_B1:%.*]] = load i8, ptr [[GEP_B1]], align 1
+; CHECK-INTERLEAVED-NEXT: [[B_EXT1:%.*]] = sext i8 [[LOAD_B1]] to i32
+; CHECK-INTERLEAVED-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[A_EXT1]], [[B_EXT1]]
+; CHECK-INTERLEAVED-NEXT: [[ADD_1]] = add i32 [[MUL_1]], [[ADD]]
+; CHECK-INTERLEAVED-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[PTR]], i64 16
+; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-INTERLEAVED: for.exit:
+; CHECK-INTERLEAVED-NEXT: [[ADD_1_LCSSA:%.*]] = phi i32 [ [[ADD_1]], [[FOR_BODY]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVED-NEXT: [[ADD_FLOAT:%.*]] = sitofp i32 [[ADD_1_LCSSA]] to float
+; CHECK-INTERLEAVED-NEXT: br label [[EXIT]]
+; CHECK-INTERLEAVED: exit:
+; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_FLOAT]], [[FOR_EXIT]] ]
+; CHECK-INTERLEAVED-NEXT: store float [[RESULT]], ptr [[MATRIX]], align 4
+; CHECK-INTERLEAVED-NEXT: ret void
 ;
 ; CHECK-MAXBW-LABEL: define void @not_dotp_not_phi2(
 ; CHECK-MAXBW-SAME: ptr [[MATRIX:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
@@ -1839,6 +2654,14 @@ define void @not_dotp_not_phi2(ptr %matrix, i32 %n) #0 {
 ; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
 ; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
 ; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]]
+; CHECK-MAXBW: for.exit:
+; CHECK-MAXBW-NEXT: [[ADD_1_LCSSA:%.*]] = phi i32 [ [[ADD_1]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[ADD_FLOAT:%.*]] = sitofp i32 [[ADD_1_LCSSA]] to float
+; CHECK-MAXBW-NEXT: br label [[EXIT]]
+; CHECK-MAXBW: exit:
+; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_FLOAT]], [[FOR_EXIT]] ]
+; CHECK-MAXBW-NEXT: store float [[RESULT]], ptr [[MATRIX]], align 4
+; CHECK-MAXBW-NEXT: ret void
 ;
 entry:
 %cmp = icmp sgt i32 %n, 0
@@ -1883,7 +2706,7 @@ exit: ; preds = %for.exit, %entry

 define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 {
 ; CHECK-INTERLEAVE1-LABEL: define i64 @not_dotp_ext_outside_plan(
-; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVE1-NEXT: entry:
 ; CHECK-INTERLEAVE1-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0
 ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]]
@@ -1909,14 +2732,35 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 {
 ; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]]
 ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK-INTERLEAVE1: middle.block:
 ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
 ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1: scalar.ph:
+; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ]
+; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ]
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: for.body:
+; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[IV]]
+; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i16, ptr [[GEP_A]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i16 [[LOAD_A]] to i64
+; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[EXT_A]], [[EXT_B]]
+; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i64 [[MUL]], [[ACCUM]]
+; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-INTERLEAVE1-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-INTERLEAVE1: exit.loopexit:
+; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVE1-NEXT: br label [[EXIT]]
+; CHECK-INTERLEAVE1: exit:
+; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[EXIT_LOOPEXIT]] ]
+; CHECK-INTERLEAVE1-NEXT: ret i64 [[RESULT]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i64 @not_dotp_ext_outside_plan(
-; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVED-NEXT: entry:
 ; CHECK-INTERLEAVED-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0
 ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]]
@@ -1948,15 +2792,36 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 {
 ; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]]
 ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
 ; CHECK-INTERLEAVED: middle.block:
 ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]])
 ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED: scalar.ph:
+; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ]
+; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ]
+; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVED: for.body:
+; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[IV]]
+; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i16, ptr [[GEP_A]], align 2
+; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i16 [[LOAD_A]] to i64
+; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[EXT_A]], [[EXT_B]]
+; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i64 [[MUL]], [[ACCUM]]
+; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-INTERLEAVED-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-INTERLEAVED: exit.loopexit:
+; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVED-NEXT: br label [[EXIT]]
+; CHECK-INTERLEAVED: exit:
+; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[EXIT_LOOPEXIT]] ]
+; CHECK-INTERLEAVED-NEXT: ret i64 [[RESULT]]
 ;
 ; CHECK-MAXBW-LABEL: define i64 @not_dotp_ext_outside_plan(
-; CHECK-MAXBW-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-MAXBW-NEXT: entry:
 ; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0
 ; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]]
@@ -1988,11 +2853,32 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 {
 ; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]]
 ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK-MAXBW: middle.block:
 ; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]])
 ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-MAXBW: scalar.ph:
+; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ]
+; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ]
+; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-MAXBW: for.body:
+; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[IV]]
+; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i16, ptr [[GEP_A]], align 2
+; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i16 [[LOAD_A]] to i64
+; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[EXT_A]], [[EXT_B]]
+; CHECK-MAXBW-NEXT: [[ADD]] = add i64 [[MUL]], [[ACCUM]]
+; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-MAXBW-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-MAXBW-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-MAXBW: exit.loopexit:
+; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
+; CHECK-MAXBW-NEXT: br label [[EXIT]]
+; CHECK-MAXBW: exit:
+; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[EXIT_LOOPEXIT]] ]
+; CHECK-MAXBW-NEXT: ret i64 [[RESULT]]
 ;
 entry:
 %cmp = icmp eq i64 %n, 0
@@ -2021,7 +2907,7 @@ exit: ; preds = %for.cond.cleanup.loopexit, %ent

 define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 {
 ; CHECK-INTERLEAVE1-LABEL: define i64 @not_dotp_ext_outside_plan2(
-; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVE1-NEXT: entry:
 ; CHECK-INTERLEAVE1-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0
 ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]]
@@ -2047,14 +2933,35 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 {
 ; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]]
 ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
 ; CHECK-INTERLEAVE1: middle.block:
 ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
 ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1: scalar.ph:
+; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ]
+; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ]
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: for.body:
+; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[IV]]
+; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i16, ptr [[GEP_A]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i16 [[LOAD_A]] to i64
+; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[EXT_B]], [[EXT_A]]
+; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i64 [[MUL]], [[ACCUM]]
+; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-INTERLEAVE1-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-INTERLEAVE1: exit.loopexit:
+; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVE1-NEXT: br label [[EXIT]]
+; CHECK-INTERLEAVE1: exit:
+; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[EXIT_LOOPEXIT]] ]
+; CHECK-INTERLEAVE1-NEXT: ret i64 [[RESULT]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i64 @not_dotp_ext_outside_plan2(
-; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVED-NEXT: entry:
 ; CHECK-INTERLEAVED-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0
 ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]]
@@ -2086,15 +2993,36 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 {
 ; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]]
 ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
 ; CHECK-INTERLEAVED: middle.block:
 ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]])
 ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED: scalar.ph:
+; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ]
+; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ]
+; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVED: for.body:
+; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[IV]]
+; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i16, ptr [[GEP_A]], align 2
+; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i16 [[LOAD_A]] to i64
+; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[EXT_B]], [[EXT_A]]
+; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i64 [[MUL]], [[ACCUM]]
+; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw
i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-INTERLEAVED: exit.loopexit: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: br label [[EXIT]] +; CHECK-INTERLEAVED: exit: +; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[EXIT_LOOPEXIT]] ] +; CHECK-INTERLEAVED-NEXT: ret i64 [[RESULT]] ; ; CHECK-MAXBW-LABEL: define i64 @not_dotp_ext_outside_plan2( -; CHECK-MAXBW-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-MAXBW-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: ; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0 ; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]] @@ -2126,11 +3054,32 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-MAXBW-NEXT: [[TMP11]] = add [[TMP10]], [[VEC_PHI]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64( [[TMP11]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i16, ptr [[GEP_A]], align 2 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i16 [[LOAD_A]] to i64 +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = add i64 [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-MAXBW: exit.loopexit: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: br label [[EXIT]] +; CHECK-MAXBW: exit: +; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[EXIT_LOOPEXIT]] ] +; CHECK-MAXBW-NEXT: ret i64 [[RESULT]] ; entry: %cmp = icmp eq i64 %n, 0 @@ -2162,3 +3111,84 @@ exit: ; preds = %for.cond.cleanup.loopexit, %ent !9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} !10 = !{!"llvm.loop.vectorize.enable", i1 true} attributes #0 = { vscale_range(1,16) "target-features"="+sve" } +;. 
+; CHECK-INTERLEAVE1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-INTERLEAVE1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-INTERLEAVE1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-INTERLEAVE1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP14]] = distinct !{[[LOOP14]], [[META15:![0-9]+]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[META15]] = !{!"llvm.loop.mustprogress"} +; CHECK-INTERLEAVE1: [[LOOP16]] = distinct !{[[LOOP16]], [[META15]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP20]] = distinct !{[[LOOP20]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP22]] = distinct !{[[LOOP22]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP23]] = distinct !{[[LOOP23]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP24]] = distinct !{[[LOOP24]], [[META2]], [[META1]]} +;. 
+; CHECK-INTERLEAVED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-INTERLEAVED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-INTERLEAVED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-INTERLEAVED: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP14]] = distinct !{[[LOOP14]], [[META15:![0-9]+]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[META15]] = !{!"llvm.loop.mustprogress"} +; CHECK-INTERLEAVED: [[LOOP16]] = distinct !{[[LOOP16]], [[META15]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP20]] = distinct !{[[LOOP20]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP22]] = distinct !{[[LOOP22]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP23]] = distinct !{[[LOOP23]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP24]] = distinct !{[[LOOP24]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP25]] = distinct !{[[LOOP25]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP26]] = distinct !{[[LOOP26]], [[META2]], [[META1]]} +;. 
+; CHECK-MAXBW: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-MAXBW: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-MAXBW: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-MAXBW: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP14]] = distinct !{[[LOOP14]], [[META15:![0-9]+]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[META15]] = !{!"llvm.loop.mustprogress"} +; CHECK-MAXBW: [[LOOP16]] = distinct !{[[LOOP16]], [[META15]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP20]] = distinct !{[[LOOP20]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP22]] = distinct !{[[LOOP22]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP23]] = distinct !{[[LOOP23]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP24]] = distinct !{[[LOOP24]], [[META2]], [[META1]]} +;. 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll index f24b115ab9f99..3561f52df9490 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll @@ -8,7 +8,7 @@ define i32 @not_dotp(ptr %a, ptr %b) { ; CHECK-LABEL: define i32 @not_dotp( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: @@ -35,7 +35,7 @@ define i32 @not_dotp(ptr %a, ptr %b) { ; CHECK-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP14]] = add <16 x i32> [[TMP12]], [[VEC_PHI1]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 992 ; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; entry: @@ -53,7 +53,7 @@ for.body: ; preds = %for.body, %entry %mul = mul i32 %ext.b, %ext.a %add = add i32 %mul, %accum %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 + %exitcond.not = icmp eq i64 %iv.next, 1000 br i1 %exitcond.not, label %for.exit, label %for.body for.exit: ; preds = %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll index ac054f569e11b..1d4b808a612a0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll @@ -148,18 +148,16 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; DEFAULT-LABEL: define void @trunc_store( ; DEFAULT-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i16 [[X:%.*]]) #[[ATTR1:[0-9]+]] { ; DEFAULT-NEXT: iter.check: -; DEFAULT-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; DEFAULT-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 -; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]] -; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; DEFAULT-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; DEFAULT: vector.memcheck: +; DEFAULT-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 1000 ; DEFAULT-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 8 ; DEFAULT-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]] -; DEFAULT-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[DST]] +; DEFAULT-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP1]] ; DEFAULT-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; DEFAULT-NEXT: br i1 [[FOUND_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; DEFAULT: vector.main.loop.iter.check: -; DEFAULT-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; DEFAULT-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; DEFAULT: vector.ph: ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <16 x i16> poison, i16 [[X]], i64 0 ; DEFAULT-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT3]], <16 x i16> poison, <16 x i32> zeroinitializer 
@@ -180,46 +178,36 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; DEFAULT-NEXT: store <16 x i8> [[TMP8]], ptr [[TMP12]], align 1, !alias.scope [[META8:![0-9]+]], !noalias [[META5]] ; DEFAULT-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP13]], align 1, !alias.scope [[META8]], !noalias [[META5]] ; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; DEFAULT-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; DEFAULT-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; DEFAULT-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 992 +; DEFAULT-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; DEFAULT: middle.block: -; DEFAULT-NEXT: br i1 true, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; DEFAULT-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; DEFAULT: vec.epilog.iter.check: -; DEFAULT-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; DEFAULT-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2 -; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP16]] -; DEFAULT-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; DEFAULT-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; DEFAULT: vec.epilog.ph: -; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; DEFAULT-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() -; DEFAULT-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 2 -; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP18]] -; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] -; DEFAULT-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() -; DEFAULT-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2 -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement poison, i16 [[X]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector [[BROADCAST_SPLATINSERT6]], poison, zeroinitializer -; DEFAULT-NEXT: [[TMP24:%.*]] = trunc [[BROADCAST_SPLAT7]] to +; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <8 x i16> poison, i16 [[X]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT4]], <8 x i16> poison, <8 x i32> zeroinitializer +; DEFAULT-NEXT: [[TMP15:%.*]] = trunc <8 x i16> [[BROADCAST_SPLAT5]] to <8 x i8> ; DEFAULT-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; DEFAULT: vec.epilog.vector.body: ; DEFAULT-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; DEFAULT-NEXT: [[TMP21:%.*]] = add i64 [[INDEX5]], 0 -; DEFAULT-NEXT: [[TMP22:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META11:![0-9]+]] -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP22]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; DEFAULT-NEXT: [[TMP23:%.*]] = trunc [[BROADCAST_SPLAT]] to -; DEFAULT-NEXT: [[TMP25:%.*]] = and [[TMP23]], [[TMP24]] +; DEFAULT-NEXT: [[TMP16:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META11:![0-9]+]] +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <8 x i64> poison, i64 [[TMP16]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT7]], 
<8 x i64> poison, <8 x i32> zeroinitializer +; DEFAULT-NEXT: [[TMP18:%.*]] = trunc <8 x i64> [[BROADCAST_SPLAT8]] to <8 x i8> +; DEFAULT-NEXT: [[TMP14:%.*]] = and <8 x i8> [[TMP18]], [[TMP15]] ; DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP21]] ; DEFAULT-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0 -; DEFAULT-NEXT: store [[TMP25]], ptr [[TMP27]], align 1, !alias.scope [[META14:![0-9]+]], !noalias [[META11]] -; DEFAULT-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], [[TMP20]] -; DEFAULT-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC]] -; DEFAULT-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; DEFAULT-NEXT: store <8 x i8> [[TMP14]], ptr [[TMP27]], align 1, !alias.scope [[META14:![0-9]+]], !noalias [[META11]] +; DEFAULT-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], 8 +; DEFAULT-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT8]], 1000 +; DEFAULT-NEXT: br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; DEFAULT: vec.epilog.middle.block: -; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]] -; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; DEFAULT-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; DEFAULT: vec.epilog.scalar.ph: -; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 0, [[VEC_EPILOG_ITER_CHECK]] ] +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 992, [[VEC_EPILOG_ITER_CHECK]] ] ; DEFAULT-NEXT: br label [[LOOP:%.*]] ; DEFAULT: loop: ; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -230,7 +218,7 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; DEFAULT-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]] ; DEFAULT-NEXT: store i8 [[TRUNC]], ptr [[GEP]], align 1 ; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 ; DEFAULT-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP17:![0-9]+]] ; DEFAULT: exit: ; DEFAULT-NEXT: ret void @@ -238,36 +226,49 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; PRED-LABEL: define void @trunc_store( ; PRED-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i16 [[X:%.*]]) #[[ATTR1:[0-9]+]] { ; PRED-NEXT: entry: -; PRED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; PRED: vector.memcheck: +; PRED-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 1000 ; PRED-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 8 ; PRED-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]] -; PRED-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[DST]] +; PRED-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP1]] ; PRED-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; PRED-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; PRED: vector.ph: -; PRED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i16> poison, i16 [[X]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT1]], <16 x i16> 
poison, <16 x i32> zeroinitializer -; PRED-NEXT: [[TMP3:%.*]] = trunc <16 x i16> [[BROADCAST_SPLAT2]] to <16 x i8> +; PRED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; PRED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP10]], 2 +; PRED-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 1000, [[TMP2]] +; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; PRED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; PRED-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 +; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1000) +; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i16 [[X]], i64 0 +; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; PRED-NEXT: [[TMP11:%.*]] = trunc [[BROADCAST_SPLAT]] to ; PRED-NEXT: br label [[VECTOR_BODY:%.*]] ; PRED: vector.body: ; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; PRED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; PRED-NEXT: [[TMP1:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META4:![0-9]+]] -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TMP1]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer -; PRED-NEXT: [[TMP2:%.*]] = trunc <16 x i64> [[BROADCAST_SPLAT]] to <16 x i8> -; PRED-NEXT: [[TMP4:%.*]] = and <16 x i8> [[TMP2]], [[TMP3]] +; PRED-NEXT: [[TMP7:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META4:![0-9]+]] +; PRED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i64 [[TMP7]], i64 0 +; PRED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer +; PRED-NEXT: [[TMP8:%.*]] = trunc [[BROADCAST_SPLAT3]] to +; PRED-NEXT: [[TMP9:%.*]] = and [[TMP8]], [[TMP11]] ; PRED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]] ; PRED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0 -; PRED-NEXT: store <16 x i8> [[TMP4]], ptr [[TMP6]], align 1, !alias.scope [[META7:![0-9]+]], !noalias [[META4]] -; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; PRED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; PRED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; PRED-NEXT: call void @llvm.masked.store.nxv2i8.p0( [[TMP9]], ptr [[TMP6]], i32 1, [[ACTIVE_LANE_MASK]]), !alias.scope [[META7:![0-9]+]], !noalias [[META4]] +; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] +; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1000) +; PRED-NEXT: [[TMP12:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) +; PRED-NEXT: [[TMP13:%.*]] = extractelement [[TMP12]], i32 0 +; PRED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; PRED: middle.block: ; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; PRED: scalar.ph: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, 
[[ENTRY:%.*]] ] ; PRED-NEXT: br label [[LOOP:%.*]] ; PRED: loop: ; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -278,7 +279,7 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; PRED-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]] ; PRED-NEXT: store i8 [[TRUNC]], ptr [[GEP]], align 1 ; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 ; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; PRED: exit: ; PRED-NEXT: ret void @@ -295,7 +296,7 @@ loop: %gep = getelementptr i8, ptr %dst, i64 %iv store i8 %trunc, ptr %gep, align 1 %iv.next = add i64 %iv, 1 - %ec = icmp eq i64 %iv.next, 0 + %ec = icmp eq i64 %iv.next, 1000 br i1 %ec, label %exit, label %loop exit: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll index fb5d513dfbd75..8333c3193d799 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll @@ -972,7 +972,7 @@ loop: %red.next = fadd double %for, %red %for.next = sitofp i32 %iv to double %iv.next = add nsw i32 %iv, 1 - %ec = icmp eq i32 %iv.next, 0 + %ec = icmp eq i32 %iv.next, 1024 br i1 %ec, label %exit, label %loop, !llvm.loop !13 exit: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll index 7778f01c58dc3..91dd6e475ec47 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll @@ -23,7 +23,7 @@ loop.body: %add = add i64 %a, %b store i64 %add, ptr %addr %iv.next = add nsw i32 %iv, 1 - %cond = icmp ne i32 %iv.next, 0 + %cond = icmp ne i32 %iv.next, 1000 br i1 %cond, label %loop.body, label %exit, !llvm.loop !0 exit: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll index 5dd9f8ff97cca..ccf8540b4ebf7 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll @@ -10,7 +10,7 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) { ; CHECK: VPlan 'Initial VPlan for VF={8,16},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count -; CHECK-NEXT: Live-in ir<0> = original trip-count +; CHECK-NEXT: Live-in ir<1024> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: Successor(s): vector.ph @@ -42,7 +42,7 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) { ; CHECK-NEXT: middle.block: ; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, ir<[[REDUCE]]> ; CHECK-NEXT: EMIT vp<[[EXTRACT:%.+]]> = extract-from-end vp<[[RED_RESULT]]>, ir<1> -; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<0>, vp<%1> +; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1024>, vp<%1> ; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph ; CHECK-EMPTY: @@ -63,7 +63,7 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) { ; CHECK-NEXT: IR %mul = mul i32 %ext.b, %ext.a ; CHECK-NEXT: IR %add = add i32 %mul, %accum ; CHECK-NEXT: IR %iv.next = add i64 %iv, 1 -; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %iv.next, 0 +; CHECK-NEXT: IR 
%exitcond.not = icmp eq i64 %iv.next, 1024 ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: @@ -86,7 +86,7 @@ for.body: ; preds = %for.body, %entry %mul = mul i32 %ext.b, %ext.a %add = add i32 %mul, %accum %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 + %exitcond.not = icmp eq i64 %iv.next, 1024 br i1 %exitcond.not, label %exit, label %for.body exit: diff --git a/llvm/test/Transforms/LoopVectorize/Hexagon/maximum-vf-crash.ll b/llvm/test/Transforms/LoopVectorize/Hexagon/maximum-vf-crash.ll index 4966ddd299492..6b201fdf21d21 100644 --- a/llvm/test/Transforms/LoopVectorize/Hexagon/maximum-vf-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/Hexagon/maximum-vf-crash.ll @@ -8,17 +8,17 @@ target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" ; Function Attrs: optsize -define i32 @f() #0 { +define i32 @f(ptr %src) #0 { entry: br label %loop loop: %g.016 = phi i32 [ 0, %entry ], [ %g.1.lcssa, %loop ] %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] - %0 = load i8, ptr undef, align 1 - %g.1.lcssa = add i32 %g.016, undef + %0 = load i8, ptr %src, align 1 + %g.1.lcssa = add i32 %g.016, 1 %iv.next = add nsw i32 %iv, 1 - %exitcond = icmp eq i32 %iv.next, 0 + %exitcond = icmp eq i32 %iv.next, 1000 br i1 %exitcond, label %exit, label %loop exit: diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-constant-known-via-scev.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-constant-known-via-scev.ll index 8b47aee6bf389..0c5db437c177d 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-constant-known-via-scev.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-constant-known-via-scev.ll @@ -64,7 +64,7 @@ define i64 @second_lshr_operand_zero_via_scev() { ; CHECK-LABEL: define i64 @second_lshr_operand_zero_via_scev() { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[EXT_0:%.*]] = sext i8 0 to i32 -; CHECK-NEXT: br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: @@ -90,14 +90,14 @@ define i64 @second_lshr_operand_zero_via_scev() { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], splat (i64 2) ; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <2 x i32> [[STEP_ADD4]], splat (i32 2) -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[BIN_RDX:%.*]] = or <2 x i64> [[TMP11]], [[TMP10]] ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[BIN_RDX]]) ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOPS:.*]] ; CHECK: [[LOOPS]]: @@ -111,7 +111,7 @@ define i64 @second_lshr_operand_zero_via_scev() { ; CHECK-NEXT: [[RED_NEXT_V:%.*]] = select i1 [[C]], i64 [[AND]], i64 [[CONV_1]] ; CHECK-NEXT: [[RED_NEXT]] = or i64 [[RED_NEXT_V]], [[RED]] ; 
CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 ; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOPS]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOPS]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ] @@ -132,7 +132,7 @@ loops: %red.next.v = select i1 %c, i64 %and, i64 %conv.1 %red.next = or i64 %red.next.v, %red %iv.next = add i64 %iv, 1 - %ec = icmp eq i64 %iv.next, 0 + %ec = icmp eq i64 %iv.next, 1000 br i1 %ec, label %exit, label %loops exit: diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll index 5c0aeb526e50c..bd28e28ddff95 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll @@ -1159,47 +1159,39 @@ define i32 @narrowed_reduction(ptr %a, i1 %cmp) #0 { ; CHECK-LABEL: @narrowed_reduction( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP:%.*]] to i32 -; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[CONV]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = and <16 x i32> [[VEC_PHI]], splat (i32 1) +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = and <16 x i32> [[VEC_PHI1]], splat (i32 1) -; CHECK-NEXT: [[TMP2:%.*]] = or <16 x i32> [[TMP0]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP1]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP4:%.*]] = trunc <16 x i32> [[TMP2]] to <16 x i1> ; CHECK-NEXT: [[TMP5:%.*]] = trunc <16 x i32> [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP6]] = zext <16 x i1> [[TMP4]] to <16 x i32> ; CHECK-NEXT: [[TMP7]] = zext <16 x i1> [[TMP5]] to <16 x i32> -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32 -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP9:%.*]] = trunc <16 x i32> [[TMP6]] to <16 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = trunc <16 x i32> [[TMP7]] to <16 x i1> -; CHECK-NEXT: [[BIN_RDX:%.*]] = or <16 x i1> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[BIN_RDX]]) -; CHECK-NEXT: [[TMP12:%.*]] = zext i1 [[TMP11]] to i32 -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: [[TMP20:%.*]] = 
call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP10]]) +; CHECK-NEXT: [[TMP21:%.*]] = zext i1 [[TMP20]] to i32 +; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[VEC_EPILOG_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 17, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OR13:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[OR:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INC:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OR13:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[OR:%.*]], [[LOOP1]] ] ; CHECK-NEXT: [[AND:%.*]] = and i32 [[OR13]], 1 ; CHECK-NEXT: [[OR]] = or i32 [[AND]], [[CONV]] ; CHECK-NEXT: [[INC]] = add i32 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 0 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]] +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 16 +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP29:![0-9]+]] ; CHECK: exit: -; CHECK-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[LOOP]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[LOOP1]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[OR_LCSSA]] ; entry: @@ -1212,7 +1204,7 @@ loop: %and = and i32 %or13, 1 %or = or i32 %and, %conv %inc = add i32 %iv, 1 - %ec = icmp eq i32 %iv, 0 + %ec = icmp eq i32 %iv, 16 br i1 %ec, label %exit, label %loop exit: diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll index 0686395567cc2..68695a8b1282c 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll @@ -352,9 +352,7 @@ define void @drop_zext_nneg(ptr noalias %p, ptr noalias %p1) #0 { ; CHECK-LABEL: define void @drop_zext_nneg( ; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[P1:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] -; CHECK: vector.scevcheck: -; CHECK-NEXT: br i1 true, label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: @@ -372,12 +370,12 @@ define void @drop_zext_nneg(ptr noalias %p, ptr noalias %p1) #0 { ; CHECK-NEXT: store double [[TMP6]], ptr [[P1]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, 
[[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[BODY:%.*]] ; CHECK: body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[NEXT:%.*]], [[ELSE:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -394,7 +392,7 @@ define void @drop_zext_nneg(ptr noalias %p, ptr noalias %p1) #0 { ; CHECK-NEXT: [[PHI:%.*]] = phi double [ [[TMP9]], [[THEN]] ], [ 0.000000e+00, [[BODY]] ] ; CHECK-NEXT: store double [[PHI]], ptr [[P1]], align 8 ; CHECK-NEXT: [[NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[NEXT]], 0 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[NEXT]], 1024 ; CHECK-NEXT: br i1 [[CMP]], label [[EXIT]], label [[BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void @@ -419,7 +417,7 @@ else: %phi = phi double [ %1, %then ], [ 0.000000e+00, %body ] store double %phi, ptr %p1, align 8 %next = add i64 %iv, 1 - %cmp = icmp eq i64 %next, 0 + %cmp = icmp eq i64 %next, 1024 br i1 %cmp, label %exit, label %body exit: diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll index 5c9375eb1d17f..d18d618c6a447 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll @@ -8,78 +8,7 @@ define void @test_free_instructions_feeding_geps_for_interleave_groups(ptr noali ; CHECK-LABEL: define void @test_free_instructions_feeding_geps_for_interleave_groups( ; CHECK-SAME: ptr noalias [[P_INVAR:%.*]], ptr noalias [[DST_1:%.*]], ptr noalias [[DST_2:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] -; CHECK: [[VECTOR_SCEVCHECK]]: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST_1]], i64 8 -; CHECK-NEXT: [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 -1) -; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } [[MUL]], 0 -; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[MUL]], 1 -; CHECK-NEXT: [[TMP0:%.*]] = sub i64 0, [[MUL_RESULT]] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[MUL_RESULT]] -; CHECK-NEXT: [[TMP2:%.*]] = icmp ult ptr [[TMP1]], [[SCEVGEP]] -; CHECK-NEXT: [[TMP3:%.*]] = or i1 [[TMP2]], [[MUL_OVERFLOW]] -; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST_1]], i64 12 -; CHECK-NEXT: [[MUL2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 -1) -; CHECK-NEXT: [[MUL_RESULT3:%.*]] = extractvalue { i64, i1 } [[MUL2]], 0 -; CHECK-NEXT: [[MUL_OVERFLOW4:%.*]] = extractvalue { i64, i1 } [[MUL2]], 1 -; CHECK-NEXT: [[TMP4:%.*]] = sub i64 0, [[MUL_RESULT3]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[SCEVGEP1]], i64 [[MUL_RESULT3]] -; CHECK-NEXT: [[TMP6:%.*]] = icmp ult ptr [[TMP5]], [[SCEVGEP1]] -; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP6]], [[MUL_OVERFLOW4]] -; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[DST_1]], i64 4 -; CHECK-NEXT: [[MUL6:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 -1) -; CHECK-NEXT: [[MUL_RESULT7:%.*]] = extractvalue { i64, i1 } [[MUL6]], 0 -; CHECK-NEXT: [[MUL_OVERFLOW8:%.*]] = extractvalue { i64, i1 } [[MUL6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = sub i64 0, [[MUL_RESULT7]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[SCEVGEP5]], i64 [[MUL_RESULT7]] -; CHECK-NEXT: [[TMP10:%.*]] = icmp ult ptr [[TMP9]], [[SCEVGEP5]] -; CHECK-NEXT: [[TMP11:%.*]] = or i1 [[TMP10]], 
[[MUL_OVERFLOW8]] -; CHECK-NEXT: [[MUL9:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 -1) -; CHECK-NEXT: [[MUL_RESULT10:%.*]] = extractvalue { i64, i1 } [[MUL9]], 0 -; CHECK-NEXT: [[MUL_OVERFLOW11:%.*]] = extractvalue { i64, i1 } [[MUL9]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = sub i64 0, [[MUL_RESULT10]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[DST_1]], i64 [[MUL_RESULT10]] -; CHECK-NEXT: [[TMP14:%.*]] = icmp ult ptr [[TMP13]], [[DST_1]] -; CHECK-NEXT: [[TMP15:%.*]] = or i1 [[TMP14]], [[MUL_OVERFLOW11]] -; CHECK-NEXT: [[SCEVGEP12:%.*]] = getelementptr i8, ptr [[DST_2]], i64 8 -; CHECK-NEXT: [[MUL13:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 -1) -; CHECK-NEXT: [[MUL_RESULT14:%.*]] = extractvalue { i64, i1 } [[MUL13]], 0 -; CHECK-NEXT: [[MUL_OVERFLOW15:%.*]] = extractvalue { i64, i1 } [[MUL13]], 1 -; CHECK-NEXT: [[TMP16:%.*]] = sub i64 0, [[MUL_RESULT14]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[SCEVGEP12]], i64 [[MUL_RESULT14]] -; CHECK-NEXT: [[TMP18:%.*]] = icmp ult ptr [[TMP17]], [[SCEVGEP12]] -; CHECK-NEXT: [[TMP19:%.*]] = or i1 [[TMP18]], [[MUL_OVERFLOW15]] -; CHECK-NEXT: [[SCEVGEP16:%.*]] = getelementptr i8, ptr [[DST_2]], i64 12 -; CHECK-NEXT: [[MUL17:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 -1) -; CHECK-NEXT: [[MUL_RESULT18:%.*]] = extractvalue { i64, i1 } [[MUL17]], 0 -; CHECK-NEXT: [[MUL_OVERFLOW19:%.*]] = extractvalue { i64, i1 } [[MUL17]], 1 -; CHECK-NEXT: [[TMP20:%.*]] = sub i64 0, [[MUL_RESULT18]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[SCEVGEP16]], i64 [[MUL_RESULT18]] -; CHECK-NEXT: [[TMP22:%.*]] = icmp ult ptr [[TMP21]], [[SCEVGEP16]] -; CHECK-NEXT: [[TMP23:%.*]] = or i1 [[TMP22]], [[MUL_OVERFLOW19]] -; CHECK-NEXT: [[SCEVGEP20:%.*]] = getelementptr i8, ptr [[DST_2]], i64 4 -; CHECK-NEXT: [[MUL21:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 -1) -; CHECK-NEXT: [[MUL_RESULT22:%.*]] = extractvalue { i64, i1 } [[MUL21]], 0 -; CHECK-NEXT: [[MUL_OVERFLOW23:%.*]] = extractvalue { i64, i1 } [[MUL21]], 1 -; CHECK-NEXT: [[TMP24:%.*]] = sub i64 0, [[MUL_RESULT22]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[SCEVGEP20]], i64 [[MUL_RESULT22]] -; CHECK-NEXT: [[TMP26:%.*]] = icmp ult ptr [[TMP25]], [[SCEVGEP20]] -; CHECK-NEXT: [[TMP27:%.*]] = or i1 [[TMP26]], [[MUL_OVERFLOW23]] -; CHECK-NEXT: [[MUL24:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 -1) -; CHECK-NEXT: [[MUL_RESULT25:%.*]] = extractvalue { i64, i1 } [[MUL24]], 0 -; CHECK-NEXT: [[MUL_OVERFLOW26:%.*]] = extractvalue { i64, i1 } [[MUL24]], 1 -; CHECK-NEXT: [[TMP28:%.*]] = sub i64 0, [[MUL_RESULT25]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[DST_2]], i64 [[MUL_RESULT25]] -; CHECK-NEXT: [[TMP30:%.*]] = icmp ult ptr [[TMP29]], [[DST_2]] -; CHECK-NEXT: [[TMP31:%.*]] = or i1 [[TMP30]], [[MUL_OVERFLOW26]] -; CHECK-NEXT: [[TMP32:%.*]] = or i1 [[TMP3]], [[TMP7]] -; CHECK-NEXT: [[TMP33:%.*]] = or i1 [[TMP32]], [[TMP11]] -; CHECK-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP15]] -; CHECK-NEXT: [[TMP35:%.*]] = or i1 [[TMP34]], [[TMP19]] -; CHECK-NEXT: [[TMP36:%.*]] = or i1 [[TMP35]], [[TMP23]] -; CHECK-NEXT: [[TMP37:%.*]] = or i1 [[TMP36]], [[TMP27]] -; CHECK-NEXT: [[TMP38:%.*]] = or i1 [[TMP37]], [[TMP31]] -; CHECK-NEXT: br i1 [[TMP38]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: @@ -106,12 +35,12 @@ 
define void @test_free_instructions_feeding_geps_for_interleave_groups(ptr noali ; CHECK-NEXT: [[INTERLEAVED_VEC31:%.*]] = shufflevector <8 x float> [[TMP51]], <8 x float> poison, <8 x i32> ; CHECK-NEXT: store <8 x float> [[INTERLEAVED_VEC31]], ptr [[TMP49]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP53:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP53]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -139,7 +68,7 @@ define void @test_free_instructions_feeding_geps_for_interleave_groups(ptr noali ; CHECK-NEXT: [[GEP_DST_276:%.*]] = getelementptr float, ptr [[DST_2]], i64 [[ADD_3]] ; CHECK-NEXT: store float 0.000000e+00, ptr [[GEP_DST_276]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void @@ -173,7 +102,7 @@ loop: %gep.dst.276 = getelementptr float, ptr %dst.2, i64 %add.3 store float 0.000000e+00, ptr %gep.dst.276, align 4 %iv.next = add i64 %iv, 1 - %ec = icmp eq i64 %iv.next, 0 + %ec = icmp eq i64 %iv.next, 1024 br i1 %ec, label %exit, label %loop exit: @@ -771,7 +700,7 @@ attributes #1 = { "min-legal-vector-width"="0" "target-cpu"="cascadelake" } ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]} ; CHECK: [[META6]] = !{[[META7:![0-9]+]]} diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll b/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll index cfae26a3a4257..4a371af87d67d 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll @@ -11,7 +11,7 @@ define void @smax_call_uniform(ptr %dst, i64 %x) { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[C:%.*]] = icmp ult i8 -68, -69 ; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[X]], 0 -; CHECK-NEXT: br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer @@ -58,12 +58,12 @@ define void @smax_call_uniform(ptr %dst, 
i64 %x) { ; CHECK-NEXT: store i64 0, ptr [[GEP]], align 8 ; CHECK-NEXT: store i64 0, ptr [[TMP19]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: ; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], %[[LOOP_LATCH:.*]] ] @@ -78,7 +78,7 @@ define void @smax_call_uniform(ptr %dst, i64 %x) { ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IV_NEXT]] ; CHECK-NEXT: store i64 0, ptr [[GEP1]], align 8 ; CHECK-NEXT: [[IV_NEXT1]] = add i64 [[IV1]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT1]], 0 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT1]], 1024 ; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void @@ -103,7 +103,7 @@ loop.latch: %gep = getelementptr i64, ptr %dst, i64 %add store i64 0, ptr %gep, align 8 %iv.next = add i64 %iv, 1 - %ec = icmp eq i64 %iv.next, 0 + %ec = icmp eq i64 %iv.next, 1024 br i1 %ec, label %exit, label %loop.header exit: diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-vector-trip-count-zero.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-vector-trip-count-zero.ll new file mode 100644 index 0000000000000..b90580638a4cd --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-vector-trip-count-zero.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p loop-vectorize -force-vector-width=2 -epilogue-vectorization-force-VF=2 -S %s | FileCheck %s + +target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64" + +; Test case for https://github.com/llvm/llvm-project/issues/122558. 
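+; The induction steps down from 0, so the backedge-taken count is all ones
+; in i32 and the trip count (BTC + 1) wraps to 0. With epilogue
+; vectorization forced via -epilogue-vectorization-force-VF=2, the checks
+; below verify that the loop is simply left in its scalar form.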
+define void @vector_trip_count_0_as_btc_is_all_1(ptr %dst) #0 { +; CHECK-LABEL: define void @vector_trip_count_0_as_btc_is_all_1( +; CHECK-SAME: ptr [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], -1 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]] +; CHECK-NEXT: store i32 [[IV_NEXT]], ptr [[GEP]], align 4 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %iv.next = add i32 %iv, -1 + %gep = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %iv.next, ptr %gep, align 4 + %ec = icmp eq i32 %iv.next, 0 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll index fc71f8a934047..01a68f01b8097 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll @@ -654,7 +654,7 @@ define double @test_resinking_required(ptr %p, ptr noalias %a, ptr noalias %b) { ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x double> [[TMP2]], i32 3 ; CHECK-NEXT: store double [[TMP6]], ptr [[P:%.*]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI10:%.*]] = extractelement <4 x double> [[TMP4]], i32 2 @@ -664,7 +664,7 @@ define double @test_resinking_required(ptr %p, ptr noalias %a, ptr noalias %b) { ; CHECK-NEXT: phi double [ [[TMP0]], %middle.block ], [ 0.000000e+00, %Entry ] ; CHECK-NEXT: phi double [ [[TMP3]], %middle.block ], [ 0.000000e+00, %Entry ] ; CHECK-NEXT: phi double [ [[VECTOR_RECUR_EXTRACT9]], %middle.block ], [ 0.000000e+00, %Entry ] -; CHECK-NEXT: %bc.resume.val = phi i64 [ 0, %middle.block ], [ 0, %Entry ] +; CHECK-NEXT: %bc.resume.val = phi i64 [ 1000, %middle.block ], [ 0, %Entry ] ; CHECK: End: ; CHECK-NEXT: = phi double [ {{.+}}, %Loop ], [ [[TMP0]], %middle.block ] ; CHECK-NEXT: = phi double [ {{.+}}, %Loop ], [ [[TMP3]], %middle.block ] @@ -684,7 +684,7 @@ Loop: %iv.next= add nuw nsw i64 %iv, 1 %l2 = load double, ptr %b, align 8 store double %div, ptr %p, align 8 - %cond = icmp eq i64 %iv.next, 0 + %cond = icmp eq i64 %iv.next, 1000 br i1 %cond, label %End, label %Loop End: diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll index 7db53d8ffcedf..2a85761da1e52 100644 --- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll +++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll @@ -555,29 +555,31 @@ define void @minimal_bit_widths_with_aliasing_store(i1 %c, ptr %ptr) { ; UNROLL-LABEL: @minimal_bit_widths_with_aliasing_store( ; UNROLL-NEXT: entry: ; UNROLL-NEXT: br label [[FOR_BODY:%.*]] -; UNROLL: for.body: -; UNROLL-NEXT: [[TMP0:%.*]] = phi i64 [ [[TMP6:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] -; UNROLL-NEXT: [[TMP1:%.*]] = phi i64 [ 
[[TMP7:%.*]], [[FOR_INC]] ], [ 0, [[ENTRY]] ] +; UNROLL: vector.body: +; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ] +; UNROLL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; UNROLL-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[TMP0]] +; UNROLL-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP1]] ; UNROLL-NEXT: [[TMP3:%.*]] = load i8, ptr [[TMP2]], align 1 +; UNROLL-NEXT: [[TMP5:%.*]] = load i8, ptr [[TMP4]], align 1 ; UNROLL-NEXT: store i8 0, ptr [[TMP2]], align 1 +; UNROLL-NEXT: store i8 0, ptr [[TMP4]], align 1 ; UNROLL-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[FOR_INC]] -; UNROLL: if.then: -; UNROLL-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i32 -; UNROLL-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 -; UNROLL-NEXT: store i8 [[TMP5]], ptr [[TMP2]], align 1 +; UNROLL: pred.store.if: +; UNROLL-NEXT: store i8 [[TMP3]], ptr [[TMP2]], align 1 +; UNROLL-NEXT: store i8 [[TMP5]], ptr [[TMP4]], align 1 ; UNROLL-NEXT: br label [[FOR_INC]] -; UNROLL: for.inc: -; UNROLL-NEXT: [[TMP6]] = add nuw nsw i64 [[TMP0]], 1 -; UNROLL-NEXT: [[TMP7]] = add i64 [[TMP1]], -1 -; UNROLL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 0 -; UNROLL-NEXT: br i1 [[TMP8]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; UNROLL: pred.store.continue2: +; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; UNROLL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; UNROLL-NEXT: br i1 [[TMP6]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; UNROLL: for.end: ; UNROLL-NEXT: ret void ; ; UNROLL-NOSIMPLIFY-LABEL: @minimal_bit_widths_with_aliasing_store( ; UNROLL-NOSIMPLIFY-NEXT: entry: -; UNROLL-NOSIMPLIFY-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; UNROLL-NOSIMPLIFY-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; UNROLL-NOSIMPLIFY: vector.ph: ; UNROLL-NOSIMPLIFY-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NOSIMPLIFY: vector.body: @@ -601,13 +603,13 @@ define void @minimal_bit_widths_with_aliasing_store(i1 %c, ptr %ptr) { ; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE2]] ; UNROLL-NOSIMPLIFY: pred.store.continue2: ; UNROLL-NOSIMPLIFY-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; UNROLL-NOSIMPLIFY: middle.block: ; UNROLL-NOSIMPLIFY-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NOSIMPLIFY: scalar.ph: -; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 1000, [[ENTRY]] ] ; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL-NOSIMPLIFY: for.body: ; UNROLL-NOSIMPLIFY-NEXT: [[TMP0:%.*]] = phi i64 [ [[TMP6:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -631,24 +633,36 @@ define void @minimal_bit_widths_with_aliasing_store(i1 %c, ptr %ptr) { ; ; VEC-LABEL: @minimal_bit_widths_with_aliasing_store( ; VEC-NEXT: entry: +; 
VEC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C1:%.*]], i64 0 +; VEC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer ; VEC-NEXT: br label [[FOR_BODY:%.*]] -; VEC: for.body: -; VEC-NEXT: [[TMP0:%.*]] = phi i64 [ [[TMP6:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] -; VEC-NEXT: [[TMP1:%.*]] = phi i64 [ [[TMP7:%.*]], [[FOR_INC]] ], [ 0, [[ENTRY]] ] +; VEC: vector.body: +; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] +; VEC-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; VEC-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[TMP0]] -; VEC-NEXT: [[TMP3:%.*]] = load i8, ptr [[TMP2]], align 1 -; VEC-NEXT: store i8 0, ptr [[TMP2]], align 1 -; VEC-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[FOR_INC]] -; VEC: if.then: -; VEC-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i32 -; VEC-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 -; VEC-NEXT: store i8 [[TMP5]], ptr [[TMP2]], align 1 +; VEC-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP2]], i32 0 +; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP3]], align 1 +; VEC-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP3]], align 1 +; VEC-NEXT: [[C:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 +; VEC-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; VEC: pred.store.if: +; VEC-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP0]] +; VEC-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 0 +; VEC-NEXT: store i8 [[TMP5]], ptr [[TMP4]], align 1 ; VEC-NEXT: br label [[FOR_INC]] -; VEC: for.inc: -; VEC-NEXT: [[TMP6]] = add nuw nsw i64 [[TMP0]], 1 -; VEC-NEXT: [[TMP7]] = add i64 [[TMP1]], -1 -; VEC-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 0 -; VEC-NEXT: br i1 [[TMP8]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; VEC: pred.store.continue: +; VEC-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 +; VEC-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] +; VEC: pred.store.if1: +; VEC-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 1 +; VEC-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP7]] +; VEC-NEXT: [[TMP9:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 1 +; VEC-NEXT: store i8 [[TMP9]], ptr [[TMP8]], align 1 +; VEC-NEXT: br label [[PRED_STORE_CONTINUE2]] +; VEC: pred.store.continue2: +; VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VEC-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; VEC-NEXT: br i1 [[TMP10]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; VEC: for.end: ; VEC-NEXT: ret void ; @@ -657,7 +671,7 @@ entry: for.body: %tmp0 = phi i64 [ %tmp6, %for.inc ], [ 0, %entry ] - %tmp1 = phi i64 [ %tmp7, %for.inc ], [ 0, %entry ] + %tmp1 = phi i64 [ %tmp7, %for.inc ], [ 1000, %entry ] %tmp2 = getelementptr i8, ptr %ptr, i64 %tmp0 %tmp3 = load i8, ptr %tmp2, align 1 store i8 0, ptr %tmp2 diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll index 96311de673d8a..939709b91062e 100644 --- a/llvm/test/Transforms/LoopVectorize/induction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction.ll @@ -3031,128 +3031,67 @@ exit: } ; This loop has a backedge taken count of i32_max. We need to check for this -; condition and branch directly to the scalar loop. +; condition and can skip vectorizing. 
- - -define i32 @max_i32_backedgetaken() nounwind readnone ssp uwtable { +define i32 @max_i32_backedgetaken() { ; CHECK-LABEL: @max_i32_backedgetaken( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0]] = and <2 x i32> [[VEC_PHI]], splat (i32 4) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] -; CHECK: middle.block: -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[TMP0]]) -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[B_0:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[B_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[A_0:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[A_0_AND:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[B_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[B_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[A_0_AND]] = and i32 [[A_0]], 4 ; CHECK-NEXT: [[B_NEXT]] = add i32 [[B_0]], -1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[B_NEXT]], 0 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP33:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: -; CHECK-NEXT: [[A_0_AND_LCSSA:%.*]] = phi i32 [ [[A_0_AND]], [[LOOP]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[A_0_AND_LCSSA:%.*]] = phi i32 [ [[A_0_AND]], [[LOOP]] ] ; CHECK-NEXT: ret i32 [[A_0_AND_LCSSA]] ; ; IND-LABEL: @max_i32_backedgetaken( ; IND-NEXT: entry: -; IND-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; IND: vector.ph: -; IND-NEXT: br label [[VECTOR_BODY:%.*]] -; IND: vector.body: -; IND-NEXT: br i1 poison, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] -; IND: middle.block: -; IND-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] -; IND: scalar.ph: ; IND-NEXT: br label [[LOOP:%.*]] ; IND: loop: -; IND-NEXT: [[B_0:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[B_NEXT:%.*]], [[LOOP]] ] +; IND-NEXT: [[B_0:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[B_NEXT:%.*]], [[LOOP]] ] ; IND-NEXT: [[B_NEXT]] = add i32 [[B_0]], -1 ; IND-NEXT: [[EC:%.*]] = icmp eq i32 [[B_NEXT]], 0 -; IND-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP33:![0-9]+]] +; IND-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]] ; IND: exit: ; IND-NEXT: ret i32 0 ; ; UNROLL-LABEL: @max_i32_backedgetaken( ; UNROLL-NEXT: entry: -; UNROLL-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; UNROLL: vector.ph: -; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] -; UNROLL: vector.body: -; UNROLL-NEXT: br i1 poison, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] -; UNROLL: middle.block: -; UNROLL-NEXT: br i1 true, label [[EXIT:%.*]], label 
[[SCALAR_PH]] -; UNROLL: scalar.ph: ; UNROLL-NEXT: br label [[LOOP:%.*]] ; UNROLL: loop: -; UNROLL-NEXT: [[B_0:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[B_NEXT:%.*]], [[LOOP]] ] +; UNROLL-NEXT: [[B_0:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[B_NEXT:%.*]], [[LOOP]] ] ; UNROLL-NEXT: [[B_NEXT]] = add i32 [[B_0]], -1 ; UNROLL-NEXT: [[EC:%.*]] = icmp eq i32 [[B_NEXT]], 0 -; UNROLL-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP33:![0-9]+]] +; UNROLL-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]] ; UNROLL: exit: ; UNROLL-NEXT: ret i32 0 ; ; UNROLL-NO-IC-LABEL: @max_i32_backedgetaken( ; UNROLL-NO-IC-NEXT: entry: -; UNROLL-NO-IC-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; UNROLL-NO-IC: vector.ph: -; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] -; UNROLL-NO-IC: vector.body: -; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-IC-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ splat (i32 -1), [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-IC-NEXT: [[TMP0]] = and <2 x i32> [[VEC_PHI]], splat (i32 4) -; UNROLL-NO-IC-NEXT: [[TMP1]] = and <2 x i32> [[VEC_PHI1]], splat (i32 4) -; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0 -; UNROLL-NO-IC-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] -; UNROLL-NO-IC: middle.block: -; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = and <2 x i32> [[TMP1]], [[TMP0]] -; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]]) -; UNROLL-NO-IC-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] -; UNROLL-NO-IC: scalar.ph: -; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ] -; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]] ; UNROLL-NO-IC: loop: -; UNROLL-NO-IC-NEXT: [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ] -; UNROLL-NO-IC-NEXT: [[B_0:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[B_NEXT:%.*]], [[LOOP]] ] +; UNROLL-NO-IC-NEXT: [[A_0:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[A_0_AND:%.*]], [[LOOP]] ] +; UNROLL-NO-IC-NEXT: [[B_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[B_NEXT:%.*]], [[LOOP]] ] ; UNROLL-NO-IC-NEXT: [[A_0_AND]] = and i32 [[A_0]], 4 ; UNROLL-NO-IC-NEXT: [[B_NEXT]] = add i32 [[B_0]], -1 ; UNROLL-NO-IC-NEXT: [[EC:%.*]] = icmp eq i32 [[B_NEXT]], 0 -; UNROLL-NO-IC-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP33:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]] ; UNROLL-NO-IC: exit: -; UNROLL-NO-IC-NEXT: [[A_0_AND_LCSSA:%.*]] = phi i32 [ [[A_0_AND]], [[LOOP]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: [[A_0_AND_LCSSA:%.*]] = phi i32 [ [[A_0_AND]], [[LOOP]] ] ; UNROLL-NO-IC-NEXT: ret i32 [[A_0_AND_LCSSA]] ; ; INTERLEAVE-LABEL: @max_i32_backedgetaken( ; INTERLEAVE-NEXT: entry: -; INTERLEAVE-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; INTERLEAVE: vector.ph: -; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] -; INTERLEAVE: vector.body: -; INTERLEAVE-NEXT: br i1 poison, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] -; INTERLEAVE: 
middle.block: -; INTERLEAVE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] -; INTERLEAVE: scalar.ph: ; INTERLEAVE-NEXT: br label [[LOOP:%.*]] ; INTERLEAVE: loop: -; INTERLEAVE-NEXT: [[B_0:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[B_NEXT:%.*]], [[LOOP]] ] +; INTERLEAVE-NEXT: [[B_0:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[B_NEXT:%.*]], [[LOOP]] ] ; INTERLEAVE-NEXT: [[B_NEXT]] = add i32 [[B_0]], -1 ; INTERLEAVE-NEXT: [[EC:%.*]] = icmp eq i32 [[B_NEXT]], 0 -; INTERLEAVE-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP33:![0-9]+]] +; INTERLEAVE-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]] ; INTERLEAVE: exit: ; INTERLEAVE-NEXT: ret i32 0 ; diff --git a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll index 5bc832fbd6842..c3164762e8130 100644 --- a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll +++ b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll @@ -317,7 +317,7 @@ define void @scalarize_ptrtoint(ptr %src, ptr %dst) { ; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: store ptr [[TMP11]], ptr %dst, align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP12]], label %middle.block, label %vector.body entry: @@ -332,7 +332,7 @@ loop: %cast.2 = inttoptr i64 %add to ptr store ptr %cast.2, ptr %dst, align 8 %iv.next = add i64 %iv, 1 - %ec = icmp eq i64 %iv.next, 0 + %ec = icmp eq i64 %iv.next, 1024 br i1 %ec, label %exit, label %loop exit: diff --git a/llvm/test/Transforms/LoopVectorize/is_fpclass.ll b/llvm/test/Transforms/LoopVectorize/is_fpclass.ll index e5e7a1c748086..186470a1e8b78 100644 --- a/llvm/test/Transforms/LoopVectorize/is_fpclass.ll +++ b/llvm/test/Transforms/LoopVectorize/is_fpclass.ll @@ -4,7 +4,7 @@ define void @d() { ; CHECK-LABEL: define void @d() { ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: @@ -19,12 +19,12 @@ define void @d() { ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, ptr [[TMP1]], i32 0 ; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[TMP4]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I7:%.*]], [[LOOP]] ] @@ -34,7 +34,7 @@ define void @d() { ; CHECK-NEXT: [[I6:%.*]] = select i1 [[I5]], float 0.000000e+00, float 0.000000e+00 ; CHECK-NEXT: store float [[I6]], ptr [[I4]], align 4 ; CHECK-NEXT: [[I7]] = add i64 [[I]], 1 -; CHECK-NEXT: 
[[I8:%.*]] = icmp eq i64 [[I7]], 0 +; CHECK-NEXT: [[I8:%.*]] = icmp eq i64 [[I7]], 128 ; CHECK-NEXT: br i1 [[I8]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void @@ -50,7 +50,7 @@ loop: %i6 = select i1 %i5, float 0.0, float 0.0 store float %i6, ptr %i4, align 4 %i7 = add i64 %i, 1 - %i8 = icmp eq i64 %i7, 0 + %i8 = icmp eq i64 %i7, 128 br i1 %i8, label %exit, label %loop exit: diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll index 1bfb34165e52e..065e38e9fa5cf 100644 --- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll @@ -821,3 +821,122 @@ loop: exit: ret void } + +define void @multiple_ivs_wide(ptr %dst) { +; CHECK-LABEL: @multiple_ivs_wide( +; CHECK-NEXT: iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP0]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], 6 +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 2) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP2]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP3]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP5]], i32 0 +; CHECK-NEXT: store i32 [[TMP10]], ptr [[TMP6]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP5]], i32 1 +; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP7]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2 +; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP8]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3 +; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP9]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 8) +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 64 +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ 64, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 64, [[VEC_EPILOG_ITER_CHECK]] ], [ -64, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0 +; CHECK-NEXT: 
[[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND2:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i32 [[INDEX1]], 2 +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[OFFSET_IDX]], 4 +; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[OFFSET_IDX]], 6 +; CHECK-NEXT: [[TMP19:%.*]] = add <4 x i32> [[VEC_IND2]], splat (i32 2) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP15]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP16]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP17]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP18]] +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP19]], i32 0 +; CHECK-NEXT: store i32 [[TMP24]], ptr [[TMP20]], align 4 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP19]], i32 1 +; CHECK-NEXT: store i32 [[TMP25]], ptr [[TMP21]], align 4 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP19]], i32 2 +; CHECK-NEXT: store i32 [[TMP26]], ptr [[TMP22]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP19]], i32 3 +; CHECK-NEXT: store i32 [[TMP27]], ptr [[TMP23]], align 4 +; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i32 [[INDEX1]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], splat (i32 8) +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i32 [[INDEX_NEXT4]], 64 +; CHECK-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ 128, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 128, [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i32 [ 64, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ -64, [[ITER_CHECK]] ], [ 64, [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL5]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL6]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 2 +; CHECK-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 2 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]] +; CHECK-NEXT: store i32 [[IV_2_NEXT]], ptr [[GEP]], align 4 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 128 +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +; CHECK-PROFITABLE-BY-DEFAULT-LABEL: @multiple_ivs_wide( +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: entry: +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br label [[LOOP:%.*]] +; CHECK-PROFITABLE-BY-DEFAULT: loop: +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 
[[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV_2:%.*]] = phi i32 [ -64, [[ENTRY]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV_NEXT]] = add i32 [[IV]], 2 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 2 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[IV]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store i32 [[IV_2_NEXT]], ptr [[GEP]], align 4 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 128 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK-PROFITABLE-BY-DEFAULT: exit: +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %iv.2 = phi i32 [ -64, %entry ], [ %iv.2.next, %loop ] + %iv.next = add i32 %iv, 2 + %iv.2.next = add i32 %iv.2, 2 + %gep = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %iv.2.next, ptr %gep, align 4 + %ec = icmp eq i32 %iv.next, 128 + br i1 %ec, label %exit, label %loop + +exit: ; preds = %loop + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll b/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll index 468b3ca337d7b..5f0a0c5d69a42 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll @@ -1528,19 +1528,17 @@ define void @stride_check_known_via_loop_guard(ptr %C, ptr %A, i32 %Acols) { ; CHECK-NEXT: [[PRE_C:%.*]] = icmp ugt i32 [[ACOLS]], 0 ; CHECK-NEXT: br i1 [[PRE_C]], label [[EXIT:%.*]], label [[OUTER_HEADER_PREHEADER:%.*]] ; CHECK: outer.header.preheader: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 8 -; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[C]], i64 34359738368 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[C]], i64 8000 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 8 ; CHECK-NEXT: br label [[OUTER_HEADER:%.*]] ; CHECK: outer.header: ; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i32 [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH:%.*]] ], [ 0, [[OUTER_HEADER_PREHEADER]] ] ; CHECK-NEXT: [[MUL_US:%.*]] = mul i32 [[OUTER_IV]], [[ACOLS]] ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr double, ptr [[A]], i32 [[MUL_US]] -; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] -; CHECK: vector.scevcheck: -; CHECK-NEXT: br i1 true, label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[C]], [[SCEVGEP]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[C]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -1549,18 +1547,18 @@ define void @stride_check_known_via_loop_guard(ptr %C, ptr %A, i32 %Acols) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[C]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX_US]], align 8, !alias.scope [[META69:![0-9]+]], !noalias [[META72:![0-9]+]] +; CHECK-NEXT: 
[[TMP2:%.*]] = load double, ptr [[ARRAYIDX_US]], align 8, !alias.scope [[META69:![0-9]+]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[TMP2]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0 -; CHECK-NEXT: store <4 x double> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8, !alias.scope [[META72]] +; CHECK-NEXT: store <4 x double> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8, !alias.scope [[META72:![0-9]+]], !noalias [[META69]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP74:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[OUTER_LATCH]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[OUTER_HEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_HEADER]] ] ; CHECK-NEXT: br label [[INNER:%.*]] ; CHECK: inner: ; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER]] ] @@ -1568,7 +1566,7 @@ define void @stride_check_known_via_loop_guard(ptr %C, ptr %A, i32 %Acols) { ; CHECK-NEXT: [[L:%.*]] = load double, ptr [[ARRAYIDX_US]], align 8 ; CHECK-NEXT: store double [[L]], ptr [[GEP_C]], align 8 ; CHECK-NEXT: [[INNER_IV_NEXT]] = add i32 [[INNER_IV]], 1 -; CHECK-NEXT: [[INNER_C:%.*]] = icmp eq i32 [[INNER_IV_NEXT]], 0 +; CHECK-NEXT: [[INNER_C:%.*]] = icmp eq i32 [[INNER_IV_NEXT]], 1000 ; CHECK-NEXT: br i1 [[INNER_C]], label [[OUTER_LATCH]], label [[INNER]], !llvm.loop [[LOOP75:![0-9]+]] ; CHECK: outer.latch: ; CHECK-NEXT: [[OUTER_IV_NEXT]] = add i32 [[OUTER_IV]], 1 @@ -1595,7 +1593,7 @@ inner: %l = load double, ptr %arrayidx.us, align 8 store double %l, ptr %gep.C, align 8 %inner.iv.next = add i32 %inner.iv, 1 - %inner.c = icmp eq i32 %inner.iv.next, 0 + %inner.c = icmp eq i32 %inner.iv.next, 1000 br i1 %inner.c, label %outer.latch, label %inner outer.latch: diff --git a/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll b/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll index 2f2d715790229..20053cd8661d1 100644 --- a/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll +++ b/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll @@ -6,12 +6,12 @@ define void @step_direction_unknown(i32 %arg, ptr %dst) { ; CHECK-SAME: (i32 [[ARG:%.*]], ptr [[DST:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ADD:%.*]] = add i32 [[ARG]], 1 -; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: ; CHECK-NEXT: [[TMP0:%.*]] = sub i32 -1, [[ARG]] ; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i32 [[ADD]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 [[TMP0]], i32 [[ADD]] -; CHECK-NEXT: [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP2]], i32 -1) +; CHECK-NEXT: [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP2]], i32 1023) ; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0 
; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = sub i32 0, [[MUL_RESULT]] @@ -19,8 +19,7 @@ define void @step_direction_unknown(i32 %arg, ptr %dst) { ; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP1]], i1 [[TMP4]], i1 false ; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP5]], [[MUL_OVERFLOW]] ; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i32 [[ADD]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] -; CHECK-NEXT: br i1 [[TMP8]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[TMP6]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[ADD]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer @@ -28,28 +27,28 @@ define void @step_direction_unknown(i32 %arg, ptr %dst) { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] -; CHECK-NEXT: [[TMP10:%.*]] = zext <4 x i32> [[TMP9]] to <4 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP10]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[TMP10]], i32 2 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[TMP10]], i32 3 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP17]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP12]], align 8 -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP14]], align 8 -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP16]], align 8 -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP18]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] +; CHECK-NEXT: [[TMP9:%.*]] = zext <4 x i32> [[TMP8]] to <4 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP9]], i32 2 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP9]], i32 3 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP16]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP11]], align 8 +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP13]], align 8 +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP15]], align 8 +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP17]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) -; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 
+; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[LOOP]] ] @@ -59,7 +58,7 @@ define void @step_direction_unknown(i32 %arg, ptr %dst) { ; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr double, ptr [[DST]], i64 [[ZEXT]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[GETELEMENTPTR]], align 8 ; CHECK-NEXT: [[ADD2]] = add i64 [[PHI]], 1 -; CHECK-NEXT: [[ICMP:%.*]] = icmp eq i64 [[ADD2]], 0 +; CHECK-NEXT: [[ICMP:%.*]] = icmp eq i64 [[ADD2]], 1024 ; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void @@ -76,7 +75,7 @@ loop: %getelementptr = getelementptr double, ptr %dst, i64 %zext store double 0.000000e+00, ptr %getelementptr, align 8 %add2 = add i64 %phi, 1 - %icmp = icmp eq i64 %add2, 0 + %icmp = icmp eq i64 %add2, 1024 br i1 %icmp, label %exit, label %loop exit: @@ -89,36 +88,34 @@ define void @integer_induction_wraps_scev_predicate_known(i32 %x, ptr %call, ptr ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MUL:%.*]] = shl i32 [[X]], 1 ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[MUL]] to i64 -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] -; CHECK: vector.scevcheck: -; CHECK-NEXT: br i1 true, label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 4294967264, [[TMP0]] -; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 992, [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP1]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP0]], 4 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i64> , [[DOTSPLAT]] -; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i64> , [[DOTSPLAT]] +; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> [[TMP4]] ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[INDEX]] to i32 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 30, [[DOTCAST]] -; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr ptr, ptr [[CALL]], i32 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr ptr, ptr [[TMP5]], i32 0 -; CHECK-NEXT: store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP6]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = 
getelementptr ptr, ptr [[CALL]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr ptr, ptr [[TMP6]], i32 0 +; CHECK-NEXT: store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP7]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4294967264 -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 992 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ -2, [[MIDDLE_BLOCK]] ], [ 30, [[VECTOR_SCEVCHECK]] ], [ 30, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[VECTOR_SCEVCHECK]] ], [ [[START]], [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1022, [[MIDDLE_BLOCK]] ], [ 30, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; CHECK-NEXT: br label [[FOR_COND:%.*]] ; CHECK: for.cond: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_COND]] ] @@ -127,7 +124,7 @@ define void @integer_induction_wraps_scev_predicate_known(i32 %x, ptr %call, ptr ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr ptr, ptr [[CALL]], i32 [[IV]] ; CHECK-NEXT: store ptr [[P_0]], ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[INC]] = add i32 [[IV]], 1 -; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[IV]], 0 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[IV]], 1024 ; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void @@ -143,7 +140,7 @@ for.cond: ; preds = %for.body, %entry %arrayidx = getelementptr ptr, ptr %call, i32 %iv store ptr %p.0, ptr %arrayidx, align 4 %inc = add i32 %iv, 1 - %tobool.not = icmp eq i32 %iv, 0 + %tobool.not = icmp eq i32 %iv, 1024 br i1 %tobool.not, label %for.end, label %for.cond for.end: ; preds = %for.cond diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll index b6391e0457697..b58a14952f7a8 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll @@ -10,13 +10,13 @@ define void @sink_with_sideeffects(i1 %c, ptr %ptr) { ; CHECK: VPlan 'Initial VPlan for VF={1},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count -; CHECK-NEXT: ir<0> = original trip-count +; CHECK-NEXT: ir<1024> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: Successor(s): vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: -; CHECK-NEXT: vp<[[END:%.+]]> = DERIVED-IV ir<0> + vp<[[VEC_TC]]> * ir<-1> +; CHECK-NEXT: vp<[[END:%.+]]> = DERIVED-IV ir<1024> + vp<[[VEC_TC]]> * ir<-1> ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { @@ -49,18 +49,18 @@ define void @sink_with_sideeffects(i1 %c, ptr %ptr) { ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: 
-; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<0>, vp<[[VEC_TC]]>
+; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1024>, vp<[[VEC_TC]]>
 ; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
 ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
 ; CHECK-NEXT: EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[VEC_TC]]>, ir<0>
-; CHECK-NEXT: EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END]]>, ir<0>
+; CHECK-NEXT: EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END]]>, ir<1024>
 ; CHECK-NEXT: Successor(s): ir-bb
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb:
 ; CHECK-NEXT: IR %tmp0 = phi i64 [ %tmp6, %for.inc ], [ 0, %entry ] (extra operand: vp<[[RESUME1]]> from scalar.ph)
-; CHECK-NEXT: IR %tmp1 = phi i64 [ %tmp7, %for.inc ], [ 0, %entry ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph)
+; CHECK-NEXT: IR %tmp1 = phi i64 [ %tmp7, %for.inc ], [ 1024, %entry ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph)
 ; CHECK: IR %tmp5 = trunc i32 %tmp4 to i8
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
@@ -73,7 +73,7 @@ entry:

 for.body:
   %tmp0 = phi i64 [ %tmp6, %for.inc ], [ 0, %entry ]
-  %tmp1 = phi i64 [ %tmp7, %for.inc ], [ 0, %entry ]
+  %tmp1 = phi i64 [ %tmp7, %for.inc ], [ 1024, %entry ]
   %tmp2 = getelementptr i8, ptr %ptr, i64 %tmp0
   %tmp3 = load i8, ptr %tmp2, align 1
   store i8 0, ptr %tmp2

From 43f203da8986bfeb071b5dc381491abbc1126e52 Mon Sep 17 00:00:00 2001
From: Jorge Gorbe Moya
Date: Tue, 14 Jan 2025 14:17:09 -0800
Subject: [PATCH 05/82] [bazel] add missing dep

---
 utils/bazel/llvm-project-overlay/clang/BUILD.bazel | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
index 9d33e94e4432b..47e632098a41b 100644
--- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
@@ -2168,6 +2168,7 @@ cc_library(
         ":sema",
         ":serialization_attr_gen",
         ":static_analyzer_core_options",
+        ":support",
         ":type_nodes_gen",
         "//llvm:BitReader",
         "//llvm:BitWriter",

From a829ebadd4211bec24e99f4395ef855eff456eb1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?=
Date: Wed, 15 Jan 2025 00:18:52 +0200
Subject: [PATCH 06/82] [Triple] Ignore the vendor field for MinGW, wrt LTO/IR compatibility (#122801)

For MinGW environments, the regular C/C++ toolchains usually use "w64"
for the vendor field in triples, while Rust toolchains usually use "pc"
in the vendor field. The differences in the vendor field have no
bearing on whether the IR is compatible on this platform. (This
probably goes for most other OSes as well, but this change limits the
scope to the specific MinGW case.)

Also add a unit test for isCompatibleWith, incorporating test cases
collected from existing tests.
---
 llvm/lib/TargetParser/Triple.cpp           | 21 +++++++++---
 llvm/unittests/TargetParser/TripleTest.cpp | 38 ++++++++++++++++++++++
 2 files changed, 54 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index 4c1de09e91f21..855889ac05620 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -2024,6 +2024,10 @@ bool Triple::isLittleEndian() const {
 }

 bool Triple::isCompatibleWith(const Triple &Other) const {
+  // On MinGW, C code is usually built with a "w64" vendor, while Rust
+  // often uses a "pc" vendor.
+  bool IgnoreVendor = isWindowsGNUEnvironment();
+
   // ARM and Thumb triples are compatible, if subarch, vendor and OS match.
if ((getArch() == Triple::thumb && Other.getArch() == Triple::arm) || (getArch() == Triple::arm && Other.getArch() == Triple::thumb) || @@ -2034,17 +2038,24 @@ bool Triple::isCompatibleWith(const Triple &Other) const { getVendor() == Other.getVendor() && getOS() == Other.getOS(); else return getSubArch() == Other.getSubArch() && - getVendor() == Other.getVendor() && getOS() == Other.getOS() && + (getVendor() == Other.getVendor() || IgnoreVendor) && + getOS() == Other.getOS() && getEnvironment() == Other.getEnvironment() && getObjectFormat() == Other.getObjectFormat(); } - // If vendor is apple, ignore the version number. + // If vendor is apple, ignore the version number (the environment field) + // and the object format. if (getVendor() == Triple::Apple) return getArch() == Other.getArch() && getSubArch() == Other.getSubArch() && - getVendor() == Other.getVendor() && getOS() == Other.getOS(); - - return *this == Other; + (getVendor() == Other.getVendor() || IgnoreVendor) && + getOS() == Other.getOS(); + + return getArch() == Other.getArch() && getSubArch() == Other.getSubArch() && + (getVendor() == Other.getVendor() || IgnoreVendor) && + getOS() == Other.getOS() && + getEnvironment() == Other.getEnvironment() && + getObjectFormat() == Other.getObjectFormat(); } std::string Triple::merge(const Triple &Other) const { diff --git a/llvm/unittests/TargetParser/TripleTest.cpp b/llvm/unittests/TargetParser/TripleTest.cpp index 7fb7625f8c2d1..3217014aa69af 100644 --- a/llvm/unittests/TargetParser/TripleTest.cpp +++ b/llvm/unittests/TargetParser/TripleTest.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/TargetParser/Triple.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/VersionTuple.h" #include "gtest/gtest.h" @@ -2737,4 +2738,41 @@ TEST(TripleTest, DXILNormaizeWithVersion) { EXPECT_EQ("dxilv1.0-pc-shadermodel5.0-compute", Triple::normalize("dxil-shadermodel5.0-pc-compute")); } + +TEST(TripleTest, isCompatibleWith) { + struct { + const char *A; + const char *B; + bool Result; + } Cases[] = { + {"armv7-linux-gnueabihf", "thumbv7-linux-gnueabihf", true}, + {"armv4-none-unknown-eabi", "thumbv6-unknown-linux-gnueabihf", false}, + {"x86_64-apple-macosx10.9.0", "x86_64-apple-macosx10.10.0", true}, + {"x86_64-apple-macosx10.9.0", "i386-apple-macosx10.9.0", false}, + {"x86_64-apple-macosx10.9.0", "x86_64h-apple-macosx10.9.0", true}, + {"x86_64-unknown-linux-gnu", "x86_64-unknown-linux-gnu", true}, + {"x86_64-unknown-linux-gnu", "i386-unknown-linux-gnu", false}, + {"x86_64-unknown-linux-gnu", "x86_64h-unknown-linux-gnu", true}, + {"x86_64-pc-windows-gnu", "x86_64-pc-windows-msvc", false}, + {"x86_64-pc-windows-msvc", "x86_64-pc-windows-msvc-elf", false}, + {"i686-w64-windows-gnu", "i386-w64-windows-gnu", true}, + {"x86_64-w64-windows-gnu", "x86_64-pc-windows-gnu", true}, + {"armv7-w64-windows-gnu", "thumbv7-pc-windows-gnu", true}, + }; + + auto DoTest = [](const char *A, const char *B, + bool Result) -> testing::AssertionResult { + if (Triple(A).isCompatibleWith(Triple(B)) != Result) { + return testing::AssertionFailure() + << llvm::formatv("Triple {0} and {1} were expected to be {2}", A, + B, Result ? "compatible" : "incompatible"); + } + return testing::AssertionSuccess(); + }; + for (const auto &C : Cases) { + EXPECT_TRUE(DoTest(C.A, C.B, C.Result)); + // Test that the comparison is commutative. 
+ EXPECT_TRUE(DoTest(C.B, C.A, C.Result)); + } +} } // end anonymous namespace From 72225ca27f561b74da292433400f250592d73b13 Mon Sep 17 00:00:00 2001 From: Min Hsu Date: Tue, 14 Jan 2025 14:08:56 -0800 Subject: [PATCH 07/82] Revert "Reland: "[Exegesis] Add the ability to dry-run the measurement phase (#121991)" (#122775)" This reverts commit a39aaf35d3858a5542f532e399482c2bb0259dac and 63d3bd6d0caf8185aba49540fe2f67512fdf3a98. Due to test failures on MacOSX. --- llvm/docs/CommandGuide/llvm-exegesis.rst | 1 - .../X86/dry-run-measurement.test | 10 ------ llvm/test/tools/llvm-exegesis/lit.local.cfg | 6 ---- .../tools/llvm-exegesis/lib/BenchmarkResult.h | 1 - .../llvm-exegesis/lib/BenchmarkRunner.cpp | 33 +++++-------------- llvm/tools/llvm-exegesis/lib/Target.cpp | 4 +-- llvm/tools/llvm-exegesis/llvm-exegesis.cpp | 9 ++--- 7 files changed, 13 insertions(+), 51 deletions(-) delete mode 100644 llvm/test/tools/llvm-exegesis/X86/dry-run-measurement.test diff --git a/llvm/docs/CommandGuide/llvm-exegesis.rst b/llvm/docs/CommandGuide/llvm-exegesis.rst index d357c2ceea418..8266d891a5e6b 100644 --- a/llvm/docs/CommandGuide/llvm-exegesis.rst +++ b/llvm/docs/CommandGuide/llvm-exegesis.rst @@ -301,7 +301,6 @@ OPTIONS * ``prepare-and-assemble-snippet``: Same as ``prepare-snippet``, but also dumps an excerpt of the sequence (hex encoded). * ``assemble-measured-code``: Same as ``prepare-and-assemble-snippet``. but also creates the full sequence that can be dumped to a file using ``--dump-object-to-disk``. * ``measure``: Same as ``assemble-measured-code``, but also runs the measurement. - * ``dry-run-measurement``: Same as measure, but does not actually execute the snippet. .. option:: --x86-lbr-sample-period= diff --git a/llvm/test/tools/llvm-exegesis/X86/dry-run-measurement.test b/llvm/test/tools/llvm-exegesis/X86/dry-run-measurement.test deleted file mode 100644 index cf13fd04261ca..0000000000000 --- a/llvm/test/tools/llvm-exegesis/X86/dry-run-measurement.test +++ /dev/null @@ -1,10 +0,0 @@ -# RUN: llvm-exegesis --mode=latency --opcode-name=LEA64r --use-dummy-perf-counters --benchmark-phase=dry-run-measurement | FileCheck %s -# REQUIRES: exegesis-can-execute-x86_64 - -# This test makes sure that llvm-exegesis doesn't execute any snippet in the presence of dry-run measurement. - -# Should not contain misleading results. -# CHECK: measurements: [] - -# Should not contain any error message. -# CHECK: error: '' diff --git a/llvm/test/tools/llvm-exegesis/lit.local.cfg b/llvm/test/tools/llvm-exegesis/lit.local.cfg index 343f34c58673e..a51a2d73442fa 100644 --- a/llvm/test/tools/llvm-exegesis/lit.local.cfg +++ b/llvm/test/tools/llvm-exegesis/lit.local.cfg @@ -30,12 +30,6 @@ def can_use_perf_counters(mode, extra_options=[]): print("could not exec llvm-exegesis") return False -# LLJIT builds its own TargetMachine using arch designated by LLVM_TARGET_ARCH, which -# is default to host. We don't want tests that use LLJIT (but not necessarily -# execute the snippets) to run on machines that are not even supported by -# exegesis. 
-if config.root.native_target in ["AArch64", "Mips", "PowerPC", "RISCV", "X86"]:
-    config.available_features.add("native-registered-exegesis-target")

 for arch in ["aarch64", "mips", "powerpc", "x86_64"]:
     if can_execute_generated_snippets(arch):
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h
index 5480d85616878..3c09a8380146e 100644
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h
@@ -38,7 +38,6 @@ enum class BenchmarkPhaseSelectorE {
   PrepareAndAssembleSnippet,
   AssembleMeasuredCode,
   Measure,
-  DryRunMeasure,
 };

 enum class BenchmarkFilter { All, RegOnly, WithMem };
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
index cc46f7feb6cf7..a7771b99e97b1 100644
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
@@ -99,7 +99,7 @@ class InProcessFunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor {
   static Expected<std::unique_ptr<InProcessFunctionExecutorImpl>>
   create(const LLVMState &State, object::OwningBinary<object::ObjectFile> Obj,
          BenchmarkRunner::ScratchSpace *Scratch,
-         std::optional<int> BenchmarkProcessCPU, bool DryRun) {
+         std::optional<int> BenchmarkProcessCPU) {
     Expected<ExecutableFunction> EF =
         ExecutableFunction::create(State.createTargetMachine(), std::move(Obj));
@@ -107,17 +107,14 @@ class InProcessFunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor {
       return EF.takeError();

     return std::unique_ptr<InProcessFunctionExecutorImpl>(
-        new InProcessFunctionExecutorImpl(State, std::move(*EF), Scratch,
-                                          DryRun));
+        new InProcessFunctionExecutorImpl(State, std::move(*EF), Scratch));
   }

 private:
   InProcessFunctionExecutorImpl(const LLVMState &State,
                                 ExecutableFunction Function,
-                                BenchmarkRunner::ScratchSpace *Scratch,
-                                bool DryRun)
-      : State(State), Function(std::move(Function)), Scratch(Scratch),
-        DryRun(DryRun) {}
+                                BenchmarkRunner::ScratchSpace *Scratch)
+      : State(State), Function(std::move(Function)), Scratch(Scratch) {}

   static void accumulateCounterValues(const SmallVector<int64_t, 4> &NewValues,
                                       SmallVector<int64_t, 4> *Result) {
@@ -146,14 +143,9 @@ class InProcessFunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor {
     CrashRecoveryContext CRC;
     CrashRecoveryContext::Enable();
     const bool Crashed = !CRC.RunSafely([this, Counter, ScratchPtr]() {
-      if (DryRun) {
-        Counter->start();
-        Counter->stop();
-      } else {
-        Counter->start();
-        this->Function(ScratchPtr);
-        Counter->stop();
-      }
+      Counter->start();
+      this->Function(ScratchPtr);
+      Counter->stop();
     });
     CrashRecoveryContext::Disable();
     PS.reset();
@@ -185,7 +177,6 @@ class InProcessFunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor {
   const LLVMState &State;
   const ExecutableFunction Function;
   BenchmarkRunner::ScratchSpace *const Scratch;
-  bool DryRun = false;
 };

 #ifdef __linux__
@@ -673,9 +664,6 @@ Expected<std::unique_ptr<BenchmarkRunner::FunctionExecutor>>
 BenchmarkRunner::createFunctionExecutor(
     object::OwningBinary<object::ObjectFile> ObjectFile, const BenchmarkKey &Key,
     std::optional<int> BenchmarkProcessCPU) const {
-  bool DryRun =
-      BenchmarkPhaseSelector == BenchmarkPhaseSelectorE::DryRunMeasure;
-
   switch (ExecutionMode) {
   case ExecutionModeE::InProcess: {
     if (BenchmarkProcessCPU.has_value())
       return make_error<Failure>("The inprocess execution mode does not "
                                  "support benchmark core pinning.");

     auto InProcessExecutorOrErr = InProcessFunctionExecutorImpl::create(
-        State, std::move(ObjectFile), Scratch.get(), BenchmarkProcessCPU,
-        DryRun);
+        State, std::move(ObjectFile), Scratch.get(), BenchmarkProcessCPU);
     if (!InProcessExecutorOrErr)
       return
InProcessExecutorOrErr.takeError(); @@ -692,10 +679,6 @@ BenchmarkRunner::createFunctionExecutor( } case ExecutionModeE::SubProcess: { #ifdef __linux__ - if (DryRun) - return make_error("The subprocess execution mode cannot " - "dry-run measurement at this moment."); - auto SubProcessExecutorOrErr = SubProcessFunctionExecutorImpl::create( State, std::move(ObjectFile), Key, BenchmarkProcessCPU); if (!SubProcessExecutorOrErr) diff --git a/llvm/tools/llvm-exegesis/lib/Target.cpp b/llvm/tools/llvm-exegesis/lib/Target.cpp index e2251ff978888..29e58692f0e92 100644 --- a/llvm/tools/llvm-exegesis/lib/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/Target.cpp @@ -98,7 +98,7 @@ ExegesisTarget::createBenchmarkRunner( return nullptr; case Benchmark::Latency: case Benchmark::InverseThroughput: - if (BenchmarkPhaseSelector >= BenchmarkPhaseSelectorE::Measure && + if (BenchmarkPhaseSelector == BenchmarkPhaseSelectorE::Measure && !PfmCounters.CycleCounter) { const char *ModeName = Mode == Benchmark::Latency ? "latency" @@ -116,7 +116,7 @@ ExegesisTarget::createBenchmarkRunner( State, Mode, BenchmarkPhaseSelector, ResultAggMode, ExecutionMode, ValidationCounters, BenchmarkRepeatCount); case Benchmark::Uops: - if (BenchmarkPhaseSelector >= BenchmarkPhaseSelectorE::Measure && + if (BenchmarkPhaseSelector == BenchmarkPhaseSelectorE::Measure && !PfmCounters.UopsCounter && !PfmCounters.IssueCounters) return make_error( "can't run 'uops' mode, sched model does not define uops or issue " diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp index 07bd44ee64f1f..fa37e05956be8 100644 --- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp +++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp @@ -132,10 +132,7 @@ static cl::opt BenchmarkPhaseSelector( clEnumValN( BenchmarkPhaseSelectorE::Measure, "measure", "Same as prepare-measured-code, but also runs the measurement " - "(default)"), - clEnumValN( - BenchmarkPhaseSelectorE::DryRunMeasure, "dry-run-measurement", - "Same as measure, but does not actually execute the snippet")), + "(default)")), cl::init(BenchmarkPhaseSelectorE::Measure)); static cl::opt @@ -479,7 +476,7 @@ static void runBenchmarkConfigurations( } void benchmarkMain() { - if (BenchmarkPhaseSelector >= BenchmarkPhaseSelectorE::Measure && + if (BenchmarkPhaseSelector == BenchmarkPhaseSelectorE::Measure && !UseDummyPerfCounters) { #ifndef HAVE_LIBPFM ExitWithError( @@ -504,7 +501,7 @@ void benchmarkMain() { // Preliminary check to ensure features needed for requested // benchmark mode are present on target CPU and/or OS. - if (BenchmarkPhaseSelector >= BenchmarkPhaseSelectorE::Measure) + if (BenchmarkPhaseSelector == BenchmarkPhaseSelectorE::Measure) ExitOnErr(State.getExegesisTarget().checkFeatureSupport()); if (ExecutionMode == BenchmarkRunner::ExecutionModeE::SubProcess && From 0b3912622ed4f3cdd311b02798f8689d52ed4602 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 14 Jan 2025 22:41:31 +0000 Subject: [PATCH 08/82] [ARM] Update LV test in test/Codegen/ARM after 1de3dc7d23. 
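The loops in this test previously exited on icmp eq i32 %inc21.i.i, 0: with
the nuw increment, the wrap back to 0 is poison, so the exit was effectively
unreachable and the trip count degenerate. After 1de3dc7d23 the tests use an
explicit trip count instead. For readers, a minimal sketch of the updated
exit-condition pattern follows; it is illustrative only, made up for this
note, and not taken from the test or from 1de3dc7d23:

; Before: exit reachable only via wraparound of the nuw counter (poison).
;   %inc = add nuw nsw i32 %i, 1
;   %exitcond = icmp eq i32 %inc, 0
; After: a concrete trip count that SCEV and the vectorizer can reason about.
define void @sketch(ptr %p) {
entry:
  br label %loop

loop:
  %i = phi i32 [ 0, %entry ], [ %inc, %loop ]
  %gep = getelementptr inbounds i32, ptr %p, i32 %i
  store i32 0, ptr %gep, align 4
  %inc = add nuw nsw i32 %i, 1
  %exitcond = icmp eq i32 %inc, 1024
  br i1 %exitcond, label %exit, label %loop

exit:
  ret void
}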
--- llvm/test/CodeGen/ARM/loopvectorize_pr33804.ll | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/test/CodeGen/ARM/loopvectorize_pr33804.ll b/llvm/test/CodeGen/ARM/loopvectorize_pr33804.ll index 8038dad3fe92f..cee7fde89b070 100644 --- a/llvm/test/CodeGen/ARM/loopvectorize_pr33804.ll +++ b/llvm/test/CodeGen/ARM/loopvectorize_pr33804.ll @@ -26,7 +26,7 @@ for.body14.i.i: ; preds = %for.body14.i.i, %en %next19.i.i = getelementptr inbounds %struct.CvNode1D, ptr %dst, i32 %i.1424.i.i, i32 1 store ptr %dst, ptr %next19.i.i, align 4 %inc21.i.i = add nuw nsw i32 %i.1424.i.i, 1 - %exitcond438.i.i = icmp eq i32 %inc21.i.i, 0 + %exitcond438.i.i = icmp eq i32 %inc21.i.i, 1024 br i1 %exitcond438.i.i, label %for.end22.i.i, label %for.body14.i.i for.end22.i.i: ; preds = %for.body14.i.i @@ -52,7 +52,7 @@ for.body14.i.i: ; preds = %for.body14.i.i, %en %val.i.i = getelementptr inbounds %struct.CvNode1D2, ptr %arrayidx15.i.i1427, i32 0, i32 1 store float 0xC415AF1D80000000, ptr %val.i.i, align 4 %inc21.i.i = add nuw nsw i32 %i.1424.i.i, 1 - %exitcond438.i.i = icmp eq i32 %inc21.i.i, 0 + %exitcond438.i.i = icmp eq i32 %inc21.i.i, 1024 br i1 %exitcond438.i.i, label %for.end22.i.i, label %for.body14.i.i for.end22.i.i: ; preds = %for.body14.i.i @@ -79,7 +79,7 @@ for.body14.i.i: ; preds = %for.body14.i.i, %en store float %loadf, ptr %dst.ptr, align 4 store ptr %loadp, ptr %dst.ptr.1, align 4 %inc21.i.i = add nuw nsw i32 %i.1424.i.i, 1 - %exitcond438.i.i = icmp eq i32 %inc21.i.i, 0 + %exitcond438.i.i = icmp eq i32 %inc21.i.i, 1024 br i1 %exitcond438.i.i, label %for.end22.i.i, label %for.body14.i.i for.end22.i.i: ; preds = %for.body14.i.i @@ -107,7 +107,7 @@ for.body14.i.i: ; preds = %for.body14.i.i, %en store float %loadf, ptr %dst.ptr, align 4 store ptr %loadp, ptr %dst.ptr.1, align 4 %inc21.i.i = add nuw nsw i32 %i.1424.i.i, 1 - %exitcond438.i.i = icmp eq i32 %inc21.i.i, 0 + %exitcond438.i.i = icmp eq i32 %inc21.i.i, 1024 br i1 %exitcond438.i.i, label %for.end22.i.i, label %for.body14.i.i for.end22.i.i: ; preds = %for.body14.i.i From 7aec7caca30f800811b76ba94291645494788a4f Mon Sep 17 00:00:00 2001 From: Eli Friedman Date: Tue, 14 Jan 2025 14:49:45 -0800 Subject: [PATCH 09/82] Add explicit triple to test_type.py. Fixes failures on 32-bit hosts. --- clang/bindings/python/tests/cindex/test_type.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/bindings/python/tests/cindex/test_type.py b/clang/bindings/python/tests/cindex/test_type.py index db7dc6458581e..9bac33f3041f4 100644 --- a/clang/bindings/python/tests/cindex/test_type.py +++ b/clang/bindings/python/tests/cindex/test_type.py @@ -546,7 +546,7 @@ class Template : public A, public B, virtual C { Template instance; int bar; """ - tu = get_tu(source, lang="cpp") + tu = get_tu(source, lang="cpp", flags=["--target=x86_64-linux-gnu"]) cursor = get_cursor(tu, "instance") cursor_type = cursor.type cursor_type_decl = cursor_type.get_declaration() From 2b961b06438d1d07c1d3d3a89bfcdbf877df0d70 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 14 Jan 2025 15:25:08 -0800 Subject: [PATCH 10/82] [AST] Migrate away from PointerUnion::dyn_cast (NFC) (#122854) Note that PointerUnion::dyn_cast has been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa<T>, cast<T> and the llvm::dyn_cast<T> Literal migration would result in dyn_cast_if_present (see the definition of PointerUnion::dyn_cast), but this patch uses dyn_cast because we expect Source to be nonnull.
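To make the mechanical change concrete, here is a sketch of the rewrite this and the two following patches apply (illustrative only, using the Disasm.cpp case below; any nonnull PointerUnion member works the same way):

    // Before: member-function form, now soft deprecated.
    if (const auto *D = Source.dyn_cast<const Decl *>()) { /* ... */ }

    // After: free-function form. Plain dyn_cast (not dyn_cast_if_present)
    // is correct here because Source is expected to be nonnull.
    if (const auto *D = dyn_cast<const Decl *>(Source)) { /* ... */ }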
--- clang/lib/AST/ByteCode/Disasm.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/ByteCode/Disasm.cpp b/clang/lib/AST/ByteCode/Disasm.cpp index 496c1dcef59b5..1aba778eaf7b9 100644 --- a/clang/lib/AST/ByteCode/Disasm.cpp +++ b/clang/lib/AST/ByteCode/Disasm.cpp @@ -368,10 +368,10 @@ LLVM_DUMP_METHOD void EvaluationResult::dump() const { case LValue: { assert(Source); QualType SourceType; - if (const auto *D = Source.dyn_cast<const Decl *>()) { + if (const auto *D = dyn_cast<const Decl *>(Source)) { if (const auto *VD = dyn_cast<ValueDecl>(D)) SourceType = VD->getType(); - } else if (const auto *E = Source.dyn_cast<const Expr *>()) { + } else if (const auto *E = dyn_cast<const Expr *>(Source)) { SourceType = E->getType(); } From e673f9d00de8a860d98ca7f0ea580ca0fa6a5ac8 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 14 Jan 2025 15:25:30 -0800 Subject: [PATCH 11/82] [Sema] Migrate away from PointerUnion::dyn_cast (NFC) (#122855) Note that PointerUnion::dyn_cast has been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa<T>, cast<T> and the llvm::dyn_cast<T> Literal migration would result in dyn_cast_if_present (see the definition of PointerUnion::dyn_cast), but this patch uses dyn_cast because we expect DeclOrVector to be nonnull. --- clang/lib/Sema/SemaCodeComplete.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/clang/lib/Sema/SemaCodeComplete.cpp b/clang/lib/Sema/SemaCodeComplete.cpp index 1f398bb004fa3..8a848df70cc5a 100644 --- a/clang/lib/Sema/SemaCodeComplete.cpp +++ b/clang/lib/Sema/SemaCodeComplete.cpp @@ -121,8 +121,7 @@ class ResultBuilder { return; } - if (const NamedDecl *PrevND = DeclOrVector.dyn_cast<const NamedDecl *>()) { + if (const NamedDecl *PrevND = dyn_cast<const NamedDecl *>(DeclOrVector)) { // 1 -> 2 elements: create the vector of results and push in the // existing declaration. DeclIndexPairVector *Vec = new DeclIndexPairVector; @@ -702,7 +701,7 @@ ResultBuilder::ShadowMapEntry::begin() const { if (DeclOrVector.isNull()) return iterator(); - if (const NamedDecl *ND = DeclOrVector.dyn_cast<const NamedDecl *>()) + if (const NamedDecl *ND = dyn_cast<const NamedDecl *>(DeclOrVector)) return iterator(ND, SingleDeclIndex); return iterator(cast<DeclIndexPairVector *>(DeclOrVector)->begin()); From a1f8ce683a14c847f49f29a450bf838d6ca522a9 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 14 Jan 2025 15:26:08 -0800 Subject: [PATCH 12/82] [StaticAnalyzer] Migrate away from PointerUnion::dyn_cast (NFC) (#122856) Note that PointerUnion::dyn_cast has been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa<T>, cast<T> and the llvm::dyn_cast<T> Literal migration would result in dyn_cast_if_present (see the definition of PointerUnion::dyn_cast), but this patch uses dyn_cast because we expect Storage to be nonnull. --- clang/lib/StaticAnalyzer/Core/ExplodedGraph.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Core/ExplodedGraph.cpp b/clang/lib/StaticAnalyzer/Core/ExplodedGraph.cpp index c4af02f21f494..55bcb6e220e1e 100644 --- a/clang/lib/StaticAnalyzer/Core/ExplodedGraph.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExplodedGraph.cpp @@ -226,7 +226,7 @@ void ExplodedNode::NodeGroup::addNode(ExplodedNode *N, ExplodedGraph &G) { return; } - ExplodedNodeVector *V = Storage.dyn_cast<ExplodedNodeVector *>(); + ExplodedNodeVector *V = dyn_cast<ExplodedNodeVector *>(Storage); if (!V) { // Switch from single-node to multi-node representation.
@@ -251,7 +251,7 @@ unsigned ExplodedNode::NodeGroup::size() const { const GroupStorage &Storage = reinterpret_cast<const GroupStorage &>(P); if (Storage.isNull()) return 0; - if (ExplodedNodeVector *V = Storage.dyn_cast<ExplodedNodeVector *>()) + if (ExplodedNodeVector *V = dyn_cast<ExplodedNodeVector *>(Storage)) return V->size(); return 1; } @@ -263,7 +263,7 @@ ExplodedNode * const *ExplodedNode::NodeGroup::begin() const { const GroupStorage &Storage = reinterpret_cast<const GroupStorage &>(P); if (Storage.isNull()) return nullptr; - if (ExplodedNodeVector *V = Storage.dyn_cast<ExplodedNodeVector *>()) + if (ExplodedNodeVector *V = dyn_cast<ExplodedNodeVector *>(Storage)) return V->begin(); return Storage.getAddrOfPtr1(); } @@ -275,7 +275,7 @@ ExplodedNode * const *ExplodedNode::NodeGroup::end() const { const GroupStorage &Storage = reinterpret_cast<const GroupStorage &>(P); if (Storage.isNull()) return nullptr; - if (ExplodedNodeVector *V = Storage.dyn_cast<ExplodedNodeVector *>()) + if (ExplodedNodeVector *V = dyn_cast<ExplodedNodeVector *>(Storage)) return V->end(); return Storage.getAddrOfPtr1() + 1; } From 466753b1097d64bc2f162bafc1d3c8743ccfd4d3 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 14 Jan 2025 15:26:36 -0800 Subject: [PATCH 13/82] [AST] Avoid repeated map lookups (NFC) (#122858) --- clang/lib/AST/ExternalASTMerger.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/ExternalASTMerger.cpp b/clang/lib/AST/ExternalASTMerger.cpp index 8bad3b36244e1..7f7816e1b10ea 100644 --- a/clang/lib/AST/ExternalASTMerger.cpp +++ b/clang/lib/AST/ExternalASTMerger.cpp @@ -276,8 +276,8 @@ bool ExternalASTMerger::HasImporterForOrigin(ASTContext &OriginContext) { template <typename CallbackType> void ExternalASTMerger::ForEachMatchingDC(const DeclContext *DC, CallbackType Callback) { - if (Origins.count(DC)) { - ExternalASTMerger::DCOrigin Origin = Origins[DC]; + if (auto It = Origins.find(DC); It != Origins.end()) { + ExternalASTMerger::DCOrigin Origin = It->second; LazyASTImporter &Importer = LazyImporterForOrigin(*this, *Origin.AST); Callback(Importer, Importer.GetReverse(), Origin.DC); } else { From 99ab848a65facc2e1837c459d4ec855cba82d157 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 14 Jan 2025 15:27:00 -0800 Subject: [PATCH 14/82] [CodeGen] Avoid repeated hash lookups (NFC) (#122861) --- llvm/include/llvm/CodeGen/ModuloSchedule.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/CodeGen/ModuloSchedule.h b/llvm/include/llvm/CodeGen/ModuloSchedule.h index e9f0f089adfef..64598ce449a44 100644 --- a/llvm/include/llvm/CodeGen/ModuloSchedule.h +++ b/llvm/include/llvm/CodeGen/ModuloSchedule.h @@ -359,8 +359,8 @@ class PeelingModuloScheduleExpander { MachineBasicBlock *CreateLCSSAExitingBlock(); /// Helper to get the stage of an instruction in the schedule. unsigned getStage(MachineInstr *MI) { - if (CanonicalMIs.count(MI)) - MI = CanonicalMIs[MI]; + if (auto It = CanonicalMIs.find(MI); It != CanonicalMIs.end()) + MI = It->second; return Schedule.getStage(MI); } /// Helper function to find the right canonical register for a phi instruction From bc74625f50e216edd16f436c4fc81ff585b6c4c7 Mon Sep 17 00:00:00 2001 From: zotnhucucbot Date: Wed, 15 Jan 2025 12:59:03 +1300 Subject: [PATCH 15/82] [clang-tidy] Add an option to exclude files not present in the compile database (#120348) A change list may include files that are not part of the compile database, which can cause clang-tidy to fail (e.g., due to missing included headers). To prevent false negatives, we should allow skipping these files.
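A usage sketch (the build directory name here is an assumption; `-p` and `-path` are existing options of the script, and `-only-check-in-db` is the option added below):

    # Lint only the changed lines of files that appear in
    # build/compile_commands.json; other changed files are skipped.
    git diff -U0 HEAD^ | clang-tidy-diff.py -p1 -path build -only-check-in-db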
--- .../clang-tidy/tool/clang-tidy-diff.py | 36 ++++++++++++++++++- clang-tools-extra/docs/ReleaseNotes.rst | 4 +++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py b/clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py index 62cb4297c50f7..33de2077dfb1a 100755 --- a/clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py +++ b/clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py @@ -35,6 +35,7 @@ import tempfile import threading import traceback +from pathlib import Path try: import yaml @@ -124,6 +125,23 @@ def merge_replacement_files(tmpdir, mergefile): open(mergefile, "w").close() +def get_compiling_files(args): + """Read a compile_commands.json database and return a set of file paths""" + current_dir = Path.cwd() + compile_commands_json = ( + (current_dir / args.build_path) if args.build_path else current_dir + ) + compile_commands_json = compile_commands_json / "compile_commands.json" + files = set() + with open(compile_commands_json) as db_file: + db_json = json.load(db_file) + for entry in db_json: + if "file" not in entry: + continue + files.add(Path(entry["file"])) + return files + + def main(): parser = argparse.ArgumentParser( description="Run clang-tidy against changed files, and " @@ -234,6 +252,13 @@ def main(): action="store_true", help="Allow empty enabled checks.", ) + parser.add_argument( + "-only-check-in-db", + dest="skip_non_compiling", + default=False, + action="store_true", + help="Only check files in the compilation database", + ) clang_tidy_args = [] argv = sys.argv[1:] @@ -243,11 +268,13 @@ def main(): args = parser.parse_args(argv) + compiling_files = get_compiling_files(args) if args.skip_non_compiling else None + # Extract changed lines for each file. filename = None lines_by_file = {} for line in sys.stdin: - match = re.search('^\\+\\+\\+\\ "?(.*?/){%s}([^ \t\n"]*)' % args.p, line) + match = re.search(r'^\+\+\+\ "?(.*?/){%s}([^ \t\n"]*)' % args.p, line) if match: filename = match.group(2) if filename is None: @@ -260,6 +287,13 @@ def main(): if not re.match("^%s$" % args.iregex, filename, re.IGNORECASE): continue + # Skip any files not in the compiling list + if ( + compiling_files is not None + and (Path.cwd() / filename) not in compiling_files + ): + continue + match = re.search(r"^@@.*\+(\d+)(,(\d+))?", line) if match: start_line = int(match.group(1)) diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 6350022ed9a8d..8ba47dfc84f26 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -108,6 +108,10 @@ Improvements to clang-query Improvements to clang-tidy -------------------------- +- Improved :program:`clang-tidy-diff.py` script. Add the `-only-check-in-db` + option to exclude files not present in the compilation database, avoiding + false-negative results. + - Improved :program:`run-clang-tidy.py` script. Fixed minor shutdown noise happening on certain platforms when interrupting the script. From b665dddd7070837b11714e28d841c9962a15601a Mon Sep 17 00:00:00 2001 From: Shoreshen <372660931@qq.com> Date: Wed, 15 Jan 2025 08:26:46 +0800 Subject: [PATCH 16/82] [AMDGPU] Add tests for v_sat_pk_u8_i16 codegen (#122438) Preparation for #121124 This PR pre-commits the tests from [PR](https://github.com/llvm/llvm-project/pull/121124), which adds selection patterns for the `v_sat_pk` instruction, so that the difference in the generated code before and after that commit is explicit.
Pre-commit tests PR for #121124 : Add selection patterns for instruction `v_sat_pk` --- llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll | 807 +++++++++++++++++++- 1 file changed, 804 insertions(+), 3 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll index b919bf0605a12..2d84e87722951 100644 --- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -2,10 +2,12 @@ ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX12 %s ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX12 %s ; @basic_smax_smin(i16 %src0, i16 %src1) { ; SDAG-GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; +; SDAG-GFX12-LABEL: basic_smax_smin: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff +; SDAG-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-VI-LABEL: basic_smax_smin: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -76,11 +91,26 @@ define <2 x i16> @basic_smax_smin(i16 %src0, i16 %src1) { ; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GISEL-GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: basic_smax_smin: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GISEL-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] + %src0.max = call i16 @llvm.smax.i16(i16 %src0, i16 0) %src0.clamp = call i16 @llvm.smin.i16(i16 %src0.max, i16 255) %src1.max = call i16 @llvm.smax.i16(i16 %src1, i16 0) %src1.clamp = call i16 @llvm.smin.i16(i16 %src1.max, i16 255) - %insert.0 = insertelement <2 x i16> undef, i16 %src0.clamp, i32 0 + %insert.0 = insertelement <2 x i16> poison, i16 %src0.clamp, i32 0 %vec = insertelement <2 x i16> %insert.0, i16 %src1.clamp, i32 1 ret <2 x i16> %vec } @@ -128,6 +158,19 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; 
SDAG-GFX11-NEXT: global_store_b32 v2, v0, s[0:1] ; SDAG-GFX11-NEXT: s_endpgm ; +; SDAG-GFX12-LABEL: basic_smax_smin_sgpr: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; SDAG-GFX12-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_med3_i16 v0, s2, 0, 0xff +; SDAG-GFX12-NEXT: v_med3_i16 v1, s3, 0, 0xff +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SDAG-GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; SDAG-GFX12-NEXT: global_store_b32 v2, v0, s[0:1] +; SDAG-GFX12-NEXT: s_endpgm +; ; GISEL-VI-LABEL: basic_smax_smin_sgpr: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -192,6 +235,28 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GISEL-GFX11-NEXT: s_endpgm +; +; GISEL-GFX12-LABEL: basic_smax_smin_sgpr: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GISEL-GFX12-NEXT: s_sext_i32_i16 s4, 0 +; GISEL-GFX12-NEXT: s_sext_i32_i16 s5, 0xff +; GISEL-GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: s_sext_i32_i16 s2, s2 +; GISEL-GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GISEL-GFX12-NEXT: s_max_i32 s2, s2, s4 +; GISEL-GFX12-NEXT: s_max_i32 s3, s3, s4 +; GISEL-GFX12-NEXT: s_sext_i32_i16 s2, s2 +; GISEL-GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GISEL-GFX12-NEXT: s_min_i32 s2, s2, s5 +; GISEL-GFX12-NEXT: s_min_i32 s3, s3, s5 +; GISEL-GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GISEL-GFX12-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GISEL-GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX12-NEXT: s_endpgm + %src0 = trunc i32 %src0ext to i16 %src1 = trunc i32 %src1ext to i16 %src0.max = call i16 @llvm.smax.i16(i16 %src0, i16 0) @@ -235,6 +300,19 @@ define <2 x i16> @basic_smin_smax(i16 %src0, i16 %src1) { ; SDAG-GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; +; SDAG-GFX12-LABEL: basic_smin_smax: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff +; SDAG-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-VI-LABEL: basic_smin_smax: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -265,6 +343,21 @@ define <2 x i16> @basic_smin_smax(i16 %src0, i16 %src1) { ; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GISEL-GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: basic_smin_smax: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GISEL-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-GFX12-NEXT: 
v_lshl_or_b32 v0, v1, 16, v0 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] + %src0.min = call i16 @llvm.smin.i16(i16 %src0, i16 255) %src0.clamp = call i16 @llvm.smax.i16(i16 %src0.min, i16 0) %src1.min = call i16 @llvm.smin.i16(i16 %src1, i16 255) @@ -305,6 +398,19 @@ define <2 x i16> @basic_smin_smax_combined(i16 %src0, i16 %src1) { ; SDAG-GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; +; SDAG-GFX12-LABEL: basic_smin_smax_combined: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff +; SDAG-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-VI-LABEL: basic_smin_smax_combined: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -335,6 +441,21 @@ define <2 x i16> @basic_smin_smax_combined(i16 %src0, i16 %src1) { ; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GISEL-GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: basic_smin_smax_combined: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GISEL-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] + %src0.min = call i16 @llvm.smin.i16(i16 %src0, i16 255) %src0.clamp = call i16 @llvm.smax.i16(i16 %src0.min, i16 0) %src1.max = call i16 @llvm.smax.i16(i16 %src1, i16 0) @@ -373,6 +494,18 @@ define <2 x i16> @vec_smax_smin(<2 x i16> %src) { ; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] ; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; +; SDAG-GFX12-LABEL: vec_smax_smin: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_pk_max_i16 v0, v0, 0 +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-VI-LABEL: vec_smax_smin: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -400,6 +533,19 @@ define <2 x i16> @vec_smax_smin(<2 x i16> %src) { ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 ; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: vec_smax_smin: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_pk_max_i16 v0, v0, 0 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] + %src.max = call <2 x i16> @llvm.smax.v2i16(<2 x i16> %src, <2 x 
i16> ) %src.clamp = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %src.max, <2 x i16> ) ret <2 x i16> %src.clamp @@ -449,6 +595,17 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; SDAG-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; SDAG-GFX11-NEXT: s_endpgm ; +; SDAG-GFX12-LABEL: vec_smax_smin_sgpr: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; SDAG-GFX12-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_pk_max_i16 v0, s2, 0 +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] +; SDAG-GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; SDAG-GFX12-NEXT: s_endpgm +; ; GISEL-VI-LABEL: vec_smax_smin_sgpr: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_load_dword s2, s[4:5], 0x2c @@ -521,6 +678,30 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GISEL-GFX11-NEXT: s_endpgm +; +; GISEL-GFX12-LABEL: vec_smax_smin_sgpr: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GISEL-GFX12-NEXT: s_sext_i32_i16 s3, 0 +; GISEL-GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: s_sext_i32_i16 s4, s2 +; GISEL-GFX12-NEXT: s_ashr_i32 s2, s2, 16 +; GISEL-GFX12-NEXT: s_max_i32 s3, s4, s3 +; GISEL-GFX12-NEXT: s_max_i32 s2, s2, 0 +; GISEL-GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-GFX12-NEXT: s_pack_ll_b32_b16 s2, s3, s2 +; GISEL-GFX12-NEXT: s_sext_i32_i16 s3, 0xff00ff +; GISEL-GFX12-NEXT: s_sext_i32_i16 s4, s2 +; GISEL-GFX12-NEXT: s_ashr_i32 s2, s2, 16 +; GISEL-GFX12-NEXT: s_min_i32 s3, s4, s3 +; GISEL-GFX12-NEXT: s_min_i32 s2, s2, 0xff +; GISEL-GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GISEL-GFX12-NEXT: s_pack_ll_b32_b16 s2, s3, s2 +; GISEL-GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX12-NEXT: s_endpgm + %src.max = call <2 x i16> @llvm.smax.v2i16(<2 x i16> %src, <2 x i16> ) %src.clamp = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %src.max, <2 x i16> ) store <2 x i16> %src.clamp, ptr addrspace(1) %out @@ -556,6 +737,18 @@ define <2 x i16> @vec_smin_smax(<2 x i16> %src) { ; SDAG-GFX11-NEXT: v_pk_max_i16 v0, v0, 0 ; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; +; SDAG-GFX12-LABEL: vec_smin_smax: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_pk_max_i16 v0, v0, 0 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-VI-LABEL: vec_smin_smax: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -583,9 +776,617 @@ define <2 x i16> @vec_smin_smax(<2 x i16> %src) { ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_pk_max_i16 v0, v0, 0 ; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: vec_smin_smax: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; GISEL-GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_pk_max_i16 v0, v0, 0 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] + %src.min = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %src, <2 x i16> ) %src.clamp = call <2 x i16> @llvm.smax.v2i16(<2 x i16> %src.min, <2 x i16> ) ret <2 x i16> %src.clamp } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX11: {{.*}} +define i16 @basic_smax_smin_bit_or(i16 %src0, i16 %src1) { +; SDAG-VI-LABEL: basic_smax_smin_bit_or: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_max_i16_e32 v0, 0, v0 +; SDAG-VI-NEXT: v_max_i16_e32 v1, 0, v1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0xff +; SDAG-VI-NEXT: v_min_i16_e32 v0, 0xff, v0 +; SDAG-VI-NEXT: v_min_i16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX9-LABEL: basic_smax_smin_bit_or: +; SDAG-GFX9: ; %bb.0: +; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; SDAG-GFX9-NEXT: v_med3_i16 v1, v1, 0, v2 +; SDAG-GFX9-NEXT: v_med3_i16 v0, v0, 0, v2 +; SDAG-GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: basic_smax_smin_bit_or: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff +; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-LABEL: basic_smax_smin_bit_or: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff +; SDAG-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: basic_smax_smin_bit_or: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_max_i16_e32 v0, 0, v0 +; GISEL-VI-NEXT: v_mov_b32_e32 v2, 0xff +; GISEL-VI-NEXT: v_max_i16_e32 v1, 0, v1 +; GISEL-VI-NEXT: v_min_i16_e32 v0, 0xff, v0 +; GISEL-VI-NEXT: v_min_i16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9-LABEL: basic_smax_smin_bit_or: +; GISEL-GFX9: ; %bb.0: +; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GISEL-GFX9-NEXT: v_med3_i16 v1, v1, 0, v2 +; GISEL-GFX9-NEXT: v_med3_i16 v0, v0, 0, v2 +; GISEL-GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: basic_smax_smin_bit_or: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff +; GISEL-GFX12-NEXT: v_med3_i16 
v0, v0, 0, 0xff +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] + + %src0.max = call i16 @llvm.smax.i16(i16 %src0, i16 0) + %src0.clamp = call i16 @llvm.smin.i16(i16 %src0.max, i16 255) + %src1.max = call i16 @llvm.smax.i16(i16 %src1, i16 0) + %src1.clamp = call i16 @llvm.smin.i16(i16 %src1.max, i16 255) + %src0.and = and i16 %src0.clamp, 255 + %src1.shl = shl i16 %src1.clamp, 8 + %or = or i16 %src0.and, %src1.shl + ret i16 %or +} +define i16 @basic_umax_umin_bit_or(i16 %src0, i16 %src1) { +; SDAG-VI-LABEL: basic_umax_umin_bit_or: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0xff +; SDAG-VI-NEXT: v_min_u16_e32 v0, 0xff, v0 +; SDAG-VI-NEXT: v_min_u16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX9-LABEL: basic_umax_umin_bit_or: +; SDAG-GFX9: ; %bb.0: +; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9-NEXT: s_movk_i32 s4, 0xff +; SDAG-GFX9-NEXT: v_min_u16_e32 v0, 0xff, v0 +; SDAG-GFX9-NEXT: v_min_u16_sdwa v1, v1, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: basic_umax_umin_bit_or: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_min_u16 v1, 0xff, v1 +; GFX11-NEXT: v_min_u16 v0, 0xff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-LABEL: basic_umax_umin_bit_or: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_min_u16 v1, 0xff, v1 +; SDAG-GFX12-NEXT: v_min_u16 v0, 0xff, v0 +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: basic_umax_umin_bit_or: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_mov_b32_e32 v2, 0xff +; GISEL-VI-NEXT: v_min_u16_e32 v0, 0xff, v0 +; GISEL-VI-NEXT: v_min_u16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9-LABEL: basic_umax_umin_bit_or: +; GISEL-GFX9: ; %bb.0: +; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GISEL-GFX9-NEXT: v_min_u16_e32 v0, 0xff, v0 +; GISEL-GFX9-NEXT: v_min_u16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: basic_umax_umin_bit_or: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; 
GISEL-GFX12-NEXT: v_min_u16 v1, 0xff, v1 +; GISEL-GFX12-NEXT: v_min_u16 v0, 0xff, v0 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] + + %src0.max = call i16 @llvm.umax.i16(i16 %src0, i16 0) + %src0.clamp = call i16 @llvm.umin.i16(i16 %src0.max, i16 255) + %src1.max = call i16 @llvm.umax.i16(i16 %src1, i16 0) + %src1.clamp = call i16 @llvm.umin.i16(i16 %src1.max, i16 255) + %src0.and = and i16 %src0.clamp, 255 + %src1.shl = shl i16 %src1.clamp, 8 + %or = or i16 %src0.and, %src1.shl + ret i16 %or +} +define i16 @basic_smax_smin_vec_cast(i16 %src0, i16 %src1) { +; SDAG-VI-LABEL: basic_smax_smin_vec_cast: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_max_i16_e32 v0, 0, v0 +; SDAG-VI-NEXT: v_max_i16_e32 v1, 0, v1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0xff +; SDAG-VI-NEXT: v_min_i16_e32 v0, 0xff, v0 +; SDAG-VI-NEXT: v_min_i16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX9-LABEL: basic_smax_smin_vec_cast: +; SDAG-GFX9: ; %bb.0: +; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; SDAG-GFX9-NEXT: v_med3_i16 v1, v1, 0, v2 +; SDAG-GFX9-NEXT: v_med3_i16 v0, v0, 0, v2 +; SDAG-GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; SDAG-GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX11-LABEL: basic_smax_smin_vec_cast: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff +; SDAG-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-LABEL: basic_smax_smin_vec_cast: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff +; SDAG-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX12-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: basic_smax_smin_vec_cast: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_max_i16_e32 v0, 0, v0 +; GISEL-VI-NEXT: v_mov_b32_e32 v2, 0xff +; GISEL-VI-NEXT: v_max_i16_e32 v1, 0, v1 +; GISEL-VI-NEXT: v_min_i16_e32 v0, 0xff, v0 +; GISEL-VI-NEXT: v_min_i16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9-LABEL: basic_smax_smin_vec_cast: +; GISEL-GFX9: ; %bb.0: +; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9-NEXT: 
v_mov_b32_e32 v2, 0xff +; GISEL-GFX9-NEXT: v_med3_i16 v1, v1, 0, v2 +; GISEL-GFX9-NEXT: v_med3_i16 v0, v0, 0, v2 +; GISEL-GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-LABEL: basic_smax_smin_vec_cast: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff +; GISEL-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: basic_smax_smin_vec_cast: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff +; GISEL-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] + + %src0.max = call i16 @llvm.smax.i16(i16 %src0, i16 0) + %src0.clamp = call i16 @llvm.smin.i16(i16 %src0.max, i16 255) + %src1.max = call i16 @llvm.smax.i16(i16 %src1, i16 0) + %src1.clamp = call i16 @llvm.smin.i16(i16 %src1.max, i16 255) + %insert.0 = insertelement <2 x i16> undef, i16 %src0.clamp, i32 0 + %vec = insertelement <2 x i16> %insert.0, i16 %src1.clamp, i32 1 + %vec.trunc = trunc <2 x i16> %vec to <2 x i8> + %cast = bitcast <2 x i8> %vec.trunc to i16 + ret i16 %cast +} +define i16 @basic_smax_smin_bit_shl(i16 %src0, i16 %src1) { +; SDAG-VI-LABEL: basic_smax_smin_bit_shl: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_max_i16_e32 v0, 0, v0 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-VI-NEXT: v_min_i16_e32 v0, 0xff, v0 +; SDAG-VI-NEXT: v_max_i16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX9-LABEL: basic_smax_smin_bit_shl: +; SDAG-GFX9: ; %bb.0: +; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; SDAG-GFX9-NEXT: v_med3_i16 v0, v0, 0, v2 +; SDAG-GFX9-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-GFX9-NEXT: v_max_i16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: basic_smax_smin_bit_shl: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_i16 v1, v1, 0 +; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-LABEL: basic_smax_smin_bit_shl: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_max_i16 v1, v1, 0 +; SDAG-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: basic_smax_smin_bit_shl: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_max_i16_e32 v0, 0, v0 +; GISEL-VI-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-VI-NEXT: v_min_i16_e32 v0, 0xff, v0 +; GISEL-VI-NEXT: v_max_i16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9-LABEL: basic_smax_smin_bit_shl: +; GISEL-GFX9: ; %bb.0: +; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GISEL-GFX9-NEXT: v_med3_i16 v0, v0, 0, v2 +; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-GFX9-NEXT: v_max_i16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: basic_smax_smin_bit_shl: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_max_i16 v1, v1, 0 +; GISEL-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] + + %src0.max = call i16 @llvm.smax.i16(i16 %src0, i16 0) + %src0.clamp = call i16 @llvm.smin.i16(i16 %src0.max, i16 255) + %src1.max = call i16 @llvm.smax.i16(i16 %src1, i16 0) + %src1.shl = shl i16 %src1.max, 8 + %or = or i16 %src0.clamp, %src1.shl + ret i16 %or +} +define i16 @basic_smax_smin_vec_input(<2 x i16> %src) { +; SDAG-VI-LABEL: basic_smax_smin_vec_input: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0xff +; SDAG-VI-NEXT: v_min_i16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; SDAG-VI-NEXT: v_min_i16_e32 v0, 0xff, v0 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-VI-NEXT: v_max_i16_e32 v0, 0, v0 +; SDAG-VI-NEXT: v_max_i16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX9-LABEL: basic_smax_smin_vec_input: +; SDAG-GFX9: ; %bb.0: +; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9-NEXT: s_movk_i32 s4, 0xff +; SDAG-GFX9-NEXT: v_pk_min_i16 v0, v0, s4 op_sel_hi:[1,0] +; SDAG-GFX9-NEXT: v_pk_max_i16 v0, v0, 0 +; SDAG-GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SDAG-GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX11-LABEL: basic_smax_smin_vec_input: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX11-NEXT: v_pk_max_i16 v0, v0, 0 +; SDAG-GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX11-NEXT: 
v_or_b32_e32 v0, v0, v1 +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-LABEL: basic_smax_smin_vec_input: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_pk_max_i16 v0, v0, 0 +; SDAG-GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: basic_smax_smin_vec_input: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0xff +; GISEL-VI-NEXT: v_min_i16_e32 v2, 0xff, v0 +; GISEL-VI-NEXT: v_min_i16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-VI-NEXT: v_max_i16_e32 v0, 0, v0 +; GISEL-VI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GISEL-VI-NEXT: v_max_i16_e32 v1, 0, v2 +; GISEL-VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GISEL-VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9-LABEL: basic_smax_smin_vec_input: +; GISEL-GFX9: ; %bb.0: +; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0xff00ff +; GISEL-GFX9-NEXT: v_pk_min_i16 v0, v1, v0 +; GISEL-GFX9-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GISEL-GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-LABEL: basic_smax_smin_vec_input: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GISEL-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: basic_smax_smin_vec_input: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GISEL-GFX12-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GISEL-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] + + %smin = call <2 x i16> @llvm.smin.v2i16(<2 x i16> , <2 x i16> %src) + %smed = call <2 x i16> @llvm.smax.v2i16(<2 x i16> , <2 x i16> %smin) + %vec.trunc = trunc <2 x i16> %smed to <2 x i8> + %cast = bitcast <2 x i8> %vec.trunc to i16 + ret i16 %cast +} +define i16 @basic_smax_smin_vec_input_rev(<2 x i16> %src) { +; SDAG-VI-LABEL: basic_smax_smin_vec_input_rev: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-VI-NEXT: v_max_i16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; SDAG-VI-NEXT: v_max_i16_e32 v0, 0, v0 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0xff +; SDAG-VI-NEXT: v_min_i16_e32 v0, 0xff, v0 +; SDAG-VI-NEXT: v_min_i16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX9-LABEL: basic_smax_smin_vec_input_rev: +; SDAG-GFX9: ; %bb.0: +; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9-NEXT: v_pk_max_i16 v0, v0, 0 +; SDAG-GFX9-NEXT: s_movk_i32 s4, 0xff +; SDAG-GFX9-NEXT: v_pk_min_i16 v0, v0, s4 op_sel_hi:[1,0] +; SDAG-GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SDAG-GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX11-LABEL: basic_smax_smin_vec_input_rev: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: v_pk_max_i16 v0, v0, 0 +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] +; SDAG-GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-LABEL: basic_smax_smin_vec_input_rev: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_pk_max_i16 v0, v0, 0 +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] +; SDAG-GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: basic_smax_smin_vec_input_rev: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-VI-NEXT: v_max_i16_e32 v1, 0, v0 +; GISEL-VI-NEXT: v_max_i16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-VI-NEXT: v_mov_b32_e32 v2, 0xff +; GISEL-VI-NEXT: v_min_i16_e32 v1, 0xff, v1 +; GISEL-VI-NEXT: v_min_i16_sdwa v0, v0, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GISEL-VI-NEXT: v_or_b32_e32 v0, v1, v0 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9-LABEL: basic_smax_smin_vec_input_rev: +; GISEL-GFX9: ; %bb.0: +; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GISEL-GFX9-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0xff00ff +; GISEL-GFX9-NEXT: v_pk_min_i16 v0, v1, v0 +; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GISEL-GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-LABEL: basic_smax_smin_vec_input_rev: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; GISEL-GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GISEL-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: basic_smax_smin_vec_input_rev: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; GISEL-GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GISEL-GFX12-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GISEL-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] + + %smax = call <2 x i16> @llvm.smax.v2i16(<2 x i16> , <2 x i16> %src) + %smed = call <2 x i16> @llvm.smin.v2i16(<2 x i16> , <2 x i16> %smax) + %vec.trunc = trunc <2 x i16> %smed to <2 x i8> + %cast = bitcast <2 x i8> %vec.trunc to i16 + ret i16 %cast +} + From ebef44067bd0a2cd776b8baea39cffa7f602ce7b Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Tue, 14 Jan 2025 20:12:29 -0500 Subject: [PATCH 17/82] [LLVM][Triple] Add an argument to specify canonical form to `Triple::normalize` (#122935) Currently, the output of `Triple::normalize` can vary depending on how the `Triple` object is constructed, producing a 3-field, 4-field, or even 5-field string. However, there is no way to control the format of the output, as all forms are considered canonical according to the LangRef. This lack of control can be inconvenient when a specific format is required. To address this, this PR introduces an argument to specify the desired format (3, 4, or 5 identifiers), with the default set to none to maintain the current behavior. If the requested format requires more components than are available in the actual `Data`, `"unknown"` is appended as needed. 
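For illustration, the new parameter behaves as follows (these calls mirror the unit tests added below; sketch only):

    // The default, CanonicalForm::ANY, keeps the current behavior:
    Triple::normalize("x86_64-gnu-linux");                            // "x86_64-unknown-linux-gnu"
    // A wider form pads with "unknown":
    Triple::normalize("a-b-c", Triple::CanonicalForm::FIVE_IDENT);    // "a-b-c-unknown-unknown"
    // A narrower form drops trailing components:
    Triple::normalize("a-b-c-d", Triple::CanonicalForm::THREE_IDENT); // "a-b-c"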
--- llvm/include/llvm/TargetParser/Triple.h | 18 ++- llvm/lib/TargetParser/Triple.cpp | 15 ++- llvm/unittests/TargetParser/TripleTest.cpp | 126 +++++++++++++++++++++ 3 files changed, 155 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h index 76914ab34c1f6..8097300c6e630 100644 --- a/llvm/include/llvm/TargetParser/Triple.h +++ b/llvm/include/llvm/TargetParser/Triple.h @@ -366,14 +366,26 @@ class Triple { /// @name Normalization /// @{ + /// Canonical form + enum class CanonicalForm { + ANY = 0, + THREE_IDENT = 3, // ARCHITECTURE-VENDOR-OPERATING_SYSTEM + FOUR_IDENT = 4, // ARCHITECTURE-VENDOR-OPERATING_SYSTEM-ENVIRONMENT + FIVE_IDENT = 5, // ARCHITECTURE-VENDOR-OPERATING_SYSTEM-ENVIRONMENT-FORMAT + }; + /// Turn an arbitrary machine specification into the canonical triple form (or /// something sensible that the Triple class understands if nothing better can /// reasonably be done). In particular, it handles the common case in which - /// otherwise valid components are in the wrong order. - static std::string normalize(StringRef Str); + /// otherwise valid components are in the wrong order. \p Form is used to + /// specify the output canonical form. + static std::string normalize(StringRef Str, + CanonicalForm Form = CanonicalForm::ANY); /// Return the normalized form of this triple's string. - std::string normalize() const { return normalize(Data); } + std::string normalize(CanonicalForm Form = CanonicalForm::ANY) const { + return normalize(Data, Form); + } /// @} /// @name Typed Component Access diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp index 855889ac05620..ed58e72089839 100644 --- a/llvm/lib/TargetParser/Triple.cpp +++ b/llvm/lib/TargetParser/Triple.cpp @@ -1128,7 +1128,7 @@ static StringRef getDXILArchNameFromShaderModel(StringRef ShaderModelStr) { return Triple::getArchName(Triple::dxil, Triple::DXILSubArch_v1_0); } -std::string Triple::normalize(StringRef Str) { +std::string Triple::normalize(StringRef Str, CanonicalForm Form) { bool IsMinGW32 = false; bool IsCygwin = false; @@ -1334,6 +1334,19 @@ std::string Triple::normalize(StringRef Str) { Components[0] = getDXILArchNameFromShaderModel(Components[2]); } } + + // Canonicalize the components if necessary. + switch (Form) { + case CanonicalForm::ANY: + break; + case CanonicalForm::THREE_IDENT: + case CanonicalForm::FOUR_IDENT: + case CanonicalForm::FIVE_IDENT: { + Components.resize(static_cast(Form), "unknown"); + break; + } + } + // Stick the corrected components back together to form the normalized string. 
return join(Components, "-"); } diff --git a/llvm/unittests/TargetParser/TripleTest.cpp b/llvm/unittests/TargetParser/TripleTest.cpp index 3217014aa69af..61b3637bb48e2 100644 --- a/llvm/unittests/TargetParser/TripleTest.cpp +++ b/llvm/unittests/TargetParser/TripleTest.cpp @@ -1417,6 +1417,132 @@ TEST(TripleTest, Normalization) { EXPECT_EQ("x86_64-unknown-linux-gnu", Triple::normalize("x86_64-gnu-linux")); + EXPECT_EQ("a-unknown-unknown", + Triple::normalize("a", Triple::CanonicalForm::THREE_IDENT)); + EXPECT_EQ("a-b-unknown", + Triple::normalize("a-b", Triple::CanonicalForm::THREE_IDENT)); + EXPECT_EQ("a-b-c", + Triple::normalize("a-b-c", Triple::CanonicalForm::THREE_IDENT)); + EXPECT_EQ("a-b-c", + Triple::normalize("a-b-c-d", Triple::CanonicalForm::THREE_IDENT)); + EXPECT_EQ("a-b-c", + Triple::normalize("a-b-c-d-e", Triple::CanonicalForm::THREE_IDENT)); + + EXPECT_EQ("a-unknown-unknown-unknown", + Triple::normalize("a", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("a-b-unknown-unknown", + Triple::normalize("a-b", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("a-b-c-unknown", + Triple::normalize("a-b-c", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("a-b-c-d", + Triple::normalize("a-b-c-d", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("a-b-c-d", + Triple::normalize("a-b-c-d-e", Triple::CanonicalForm::FOUR_IDENT)); + + EXPECT_EQ("a-unknown-unknown-unknown-unknown", + Triple::normalize("a", Triple::CanonicalForm::FIVE_IDENT)); + EXPECT_EQ("a-b-unknown-unknown-unknown", + Triple::normalize("a-b", Triple::CanonicalForm::FIVE_IDENT)); + EXPECT_EQ("a-b-c-unknown-unknown", + Triple::normalize("a-b-c", Triple::CanonicalForm::FIVE_IDENT)); + EXPECT_EQ("a-b-c-d-unknown", + Triple::normalize("a-b-c-d", Triple::CanonicalForm::FIVE_IDENT)); + EXPECT_EQ("a-b-c-d-e", + Triple::normalize("a-b-c-d-e", Triple::CanonicalForm::FIVE_IDENT)); + + EXPECT_EQ("i386-b-c-unknown", + Triple::normalize("i386-b-c", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("i386-b-c-unknown-unknown", + Triple::normalize("i386-b-c", Triple::CanonicalForm::FIVE_IDENT)); + + EXPECT_EQ("i386-a-c-unknown", + Triple::normalize("a-i386-c", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("i386-a-c-unknown-unknown", + Triple::normalize("a-i386-c", Triple::CanonicalForm::FIVE_IDENT)); + + EXPECT_EQ("i386-a-b-unknown", + Triple::normalize("a-b-i386", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("i386-a-b-c", + Triple::normalize("a-b-c-i386", Triple::CanonicalForm::FOUR_IDENT)); + + EXPECT_EQ("a-pc-c-unknown", + Triple::normalize("a-pc-c", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("unknown-pc-b-c", + Triple::normalize("pc-b-c", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("a-pc-b-unknown", + Triple::normalize("a-b-pc", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("a-pc-b-c", + Triple::normalize("a-b-c-pc", Triple::CanonicalForm::FOUR_IDENT)); + + EXPECT_EQ("a-b-linux-unknown", + Triple::normalize("a-b-linux", Triple::CanonicalForm::FOUR_IDENT)); + // We lose `-c` here as expected. 
+ EXPECT_EQ("unknown-unknown-linux-b", + Triple::normalize("linux-b-c", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("a-unknown-linux-c", + Triple::normalize("a-linux-c", Triple::CanonicalForm::FOUR_IDENT)); + + EXPECT_EQ("i386-pc-a-unknown", + Triple::normalize("a-pc-i386", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("i386-pc-unknown-unknown", + Triple::normalize("-pc-i386", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("unknown-pc-linux-c", + Triple::normalize("linux-pc-c", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("unknown-pc-linux-unknown", + Triple::normalize("linux-pc-", Triple::CanonicalForm::FOUR_IDENT)); + + EXPECT_EQ("i386-unknown-unknown-unknown", + Triple::normalize("i386", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("unknown-pc-unknown-unknown", + Triple::normalize("pc", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("unknown-unknown-linux-unknown", + Triple::normalize("linux", Triple::CanonicalForm::FOUR_IDENT)); + + EXPECT_EQ( + "x86_64-unknown-linux-gnu", + Triple::normalize("x86_64-gnu-linux", Triple::CanonicalForm::FOUR_IDENT)); + + EXPECT_EQ("i386-a-b-unknown-unknown", + Triple::normalize("a-b-i386", Triple::CanonicalForm::FIVE_IDENT)); + EXPECT_EQ("i386-a-b-c-unknown", + Triple::normalize("a-b-c-i386", Triple::CanonicalForm::FIVE_IDENT)); + + EXPECT_EQ("a-pc-c-unknown-unknown", + Triple::normalize("a-pc-c", Triple::CanonicalForm::FIVE_IDENT)); + EXPECT_EQ("unknown-pc-b-c-unknown", + Triple::normalize("pc-b-c", Triple::CanonicalForm::FIVE_IDENT)); + EXPECT_EQ("a-pc-b-unknown-unknown", + Triple::normalize("a-b-pc", Triple::CanonicalForm::FIVE_IDENT)); + EXPECT_EQ("a-pc-b-c-unknown", + Triple::normalize("a-b-c-pc", Triple::CanonicalForm::FIVE_IDENT)); + + EXPECT_EQ("a-b-linux-unknown-unknown", + Triple::normalize("a-b-linux", Triple::CanonicalForm::FIVE_IDENT)); + EXPECT_EQ("unknown-unknown-linux-b-c", + Triple::normalize("linux-b-c", Triple::CanonicalForm::FIVE_IDENT)); + EXPECT_EQ("a-unknown-linux-c-unknown", + Triple::normalize("a-linux-c", Triple::CanonicalForm::FIVE_IDENT)); + + EXPECT_EQ("i386-pc-a-unknown-unknown", + Triple::normalize("a-pc-i386", Triple::CanonicalForm::FIVE_IDENT)); + EXPECT_EQ("i386-pc-unknown-unknown-unknown", + Triple::normalize("-pc-i386", Triple::CanonicalForm::FIVE_IDENT)); + EXPECT_EQ("unknown-pc-linux-c-unknown", + Triple::normalize("linux-pc-c", Triple::CanonicalForm::FIVE_IDENT)); + EXPECT_EQ("unknown-pc-linux-unknown-unknown", + Triple::normalize("linux-pc-", Triple::CanonicalForm::FIVE_IDENT)); + + EXPECT_EQ("i386-unknown-unknown-unknown-unknown", + Triple::normalize("i386", Triple::CanonicalForm::FIVE_IDENT)); + EXPECT_EQ("unknown-pc-unknown-unknown-unknown", + Triple::normalize("pc", Triple::CanonicalForm::FIVE_IDENT)); + EXPECT_EQ("unknown-unknown-linux-unknown-unknown", + Triple::normalize("linux", Triple::CanonicalForm::FIVE_IDENT)); + + EXPECT_EQ( + "x86_64-unknown-linux-gnu-unknown", + Triple::normalize("x86_64-gnu-linux", Triple::CanonicalForm::FIVE_IDENT)); + // Check that normalizing a permutated set of valid components returns a // triple with the unpermuted components. // From a19919f4cd82166023e81d9ed8df981642c9d4ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Tue, 14 Jan 2025 17:38:04 -0800 Subject: [PATCH 18/82] [flang][cuda] Add cuf.device_address operation (#122975) Introduce a new op to get the device address from a host symbol. 
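For illustration, a conversion pattern can now materialize the device address
with a single op instead of expanding the runtime call inline (a sketch
adapted from the CUFLaunchOpConversion change in this patch; the op itself is
lowered to the _FortranACUFGetDeviceAddress runtime call by a later pattern):

  // Get the device address corresponding to the host symbol `addrOfOp` names.
  mlir::Value devAddr = rewriter.create<cuf::DeviceAddressOp>(
      op.getLoc(), addrOfOp.getType(), addrOfOp.getSymbol());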
The new op simplifies the current conversion and is also in preparation for
some legalization work that needs to be done in cuf kernel and cuf kernel
launch, similar to https://github.com/llvm/llvm-project/pull/122802
---
 .../flang/Optimizer/Dialect/CUF/CUFOps.td     | 12 +++
 .../Optimizer/Transforms/CUFOpConversion.cpp  | 76 +++++++++++++------
 flang/test/Fir/CUDA/cuda-data-transfer.fir    |  3 +
 flang/test/Fir/CUDA/cuda-global-addr.mlir     |  1 +
 flang/test/Fir/CUDA/cuda-launch.fir           |  6 +-
 5 files changed, 71 insertions(+), 27 deletions(-)

diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
index 6f886726b1283..a270e69b39410 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
@@ -335,4 +335,16 @@ def cuf_RegisterKernelOp : cuf_Op<"register_kernel", []> {
   }];
 }

+def cuf_DeviceAddressOp : cuf_Op<"device_address", []> {
+  let summary = "Get the device address from a host symbol";
+
+  let arguments = (ins SymbolRefAttr:$hostSymbol);
+
+  let assemblyFormat = [{
+    $hostSymbol attr-dict `->` type($addr)
+  }];
+
+  let results = (outs fir_ReferenceType:$addr);
+}
+
 #endif // FORTRAN_DIALECT_CUF_CUF_OPS
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index d61d9f63cb294..e93bed37d39f7 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -366,22 +366,47 @@ struct CUFAllocOpConversion : public mlir::OpRewritePattern<cuf::AllocOp> {
   const fir::LLVMTypeConverter *typeConverter;
 };

-static mlir::Value genGetDeviceAddress(mlir::PatternRewriter &rewriter,
-                                       mlir::ModuleOp mod, mlir::Location loc,
-                                       mlir::Value inputArg) {
-  fir::FirOpBuilder builder(rewriter, mod);
-  mlir::func::FuncOp callee =
-      fir::runtime::getRuntimeFunc<mkRTKey(CUFGetDeviceAddress)>(loc, builder);
-  auto fTy = callee.getFunctionType();
-  mlir::Value conv = createConvertOp(rewriter, loc, fTy.getInput(0), inputArg);
-  mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
-  mlir::Value sourceLine =
-      fir::factory::locationToLineNo(builder, loc, fTy.getInput(2));
-  llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
-      builder, loc, fTy, conv, sourceFile, sourceLine)};
-  auto call = rewriter.create<fir::CallOp>(loc, callee, args);
-  return createConvertOp(rewriter, loc, inputArg.getType(),
-                         call->getResult(0));
-}
+struct CUFDeviceAddressOpConversion
+    : public mlir::OpRewritePattern<cuf::DeviceAddressOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  CUFDeviceAddressOpConversion(mlir::MLIRContext *context,
+                               const mlir::SymbolTable &symtab)
+      : OpRewritePattern(context), symTab{symtab} {}
+
+  mlir::LogicalResult
+  matchAndRewrite(cuf::DeviceAddressOp op,
+                  mlir::PatternRewriter &rewriter) const override {
+    if (auto global = symTab.lookup<fir::GlobalOp>(
+            op.getHostSymbol().getRootReference().getValue())) {
+      auto mod = op->getParentOfType<mlir::ModuleOp>();
+      mlir::Location loc = op.getLoc();
+      auto hostAddr = rewriter.create<fir::AddrOfOp>(
+          loc, fir::ReferenceType::get(global.getType()), op.getHostSymbol());
+      fir::FirOpBuilder builder(rewriter, mod);
+      mlir::func::FuncOp callee =
+          fir::runtime::getRuntimeFunc<mkRTKey(CUFGetDeviceAddress)>(loc,
+                                                                     builder);
+      auto fTy = callee.getFunctionType();
+      mlir::Value conv =
+          createConvertOp(rewriter, loc, fTy.getInput(0), hostAddr);
+      mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
+      mlir::Value sourceLine =
+          fir::factory::locationToLineNo(builder, loc, fTy.getInput(2));
+      llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
+          builder,
loc, fTy, conv, sourceFile, sourceLine)}; + auto call = rewriter.create(loc, callee, args); + mlir::Value addr = createConvertOp(rewriter, loc, hostAddr.getType(), + call->getResult(0)); + rewriter.replaceOp(op, addr.getDefiningOp()); + return success(); + } + return failure(); + } + +private: + const mlir::SymbolTable &symTab; +}; struct DeclareOpConversion : public mlir::OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -398,9 +423,8 @@ struct DeclareOpConversion : public mlir::OpRewritePattern { addrOfOp.getSymbol().getRootReference().getValue())) { if (cuf::isRegisteredDeviceGlobal(global)) { rewriter.setInsertionPointAfter(addrOfOp); - auto mod = op->getParentOfType(); - mlir::Value devAddr = genGetDeviceAddress(rewriter, mod, op.getLoc(), - addrOfOp.getResult()); + mlir::Value devAddr = rewriter.create( + op.getLoc(), addrOfOp.getType(), addrOfOp.getSymbol()); rewriter.startOpModification(op); op.getMemrefMutable().assign(devAddr); rewriter.finalizeOpModification(op); @@ -773,7 +797,6 @@ struct CUFLaunchOpConversion } } llvm::SmallVector args; - auto mod = op->getParentOfType(); for (mlir::Value arg : op.getArgs()) { // If the argument is a global descriptor, make sure we pass the device // copy of this descriptor and not the host one. @@ -785,8 +808,11 @@ struct CUFLaunchOpConversion if (auto global = symTab.lookup( addrOfOp.getSymbol().getRootReference().getValue())) { if (cuf::isRegisteredDeviceGlobal(global)) { - arg = genGetDeviceAddress(rewriter, mod, op.getLoc(), - declareOp.getResult()); + arg = rewriter + .create(op.getLoc(), + addrOfOp.getType(), + addrOfOp.getSymbol()) + .getResult(); } } } @@ -907,10 +933,12 @@ void cuf::populateCUFToFIRConversionPatterns( patterns.getContext()); patterns.insert(patterns.getContext(), symtab, &dl, &converter); - patterns.insert(patterns.getContext(), symtab); + patterns.insert( + patterns.getContext(), symtab); } void cuf::populateFIRCUFConversionPatterns(const mlir::SymbolTable &symtab, mlir::RewritePatternSet &patterns) { - patterns.insert(patterns.getContext(), symtab); + patterns.insert( + patterns.getContext(), symtab); } diff --git a/flang/test/Fir/CUDA/cuda-data-transfer.fir b/flang/test/Fir/CUDA/cuda-data-transfer.fir index 7203c33e7eb11..5ed27f1be0a43 100644 --- a/flang/test/Fir/CUDA/cuda-data-transfer.fir +++ b/flang/test/Fir/CUDA/cuda-data-transfer.fir @@ -198,6 +198,7 @@ func.func @_QPsub8() attributes {fir.bindc_name = "t"} { // CHECK-LABEL: func.func @_QPsub8() // CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<5xi32> // CHECK: %[[LOCAL:.*]] = fir.declare %[[ALLOCA]] +// CHECK: fir.address_of(@_QMmtestsEn) : !fir.ref> // CHECK: %[[GBL:.*]] = fir.address_of(@_QMmtestsEn) : !fir.ref> // CHECK: %[[GBL_CONV:.*]] = fir.convert %[[GBL]] : (!fir.ref>) -> !fir.llvm_ptr // CHECK: %[[ADDR:.*]] = fir.call @_FortranACUFGetDeviceAddress(%[[GBL_CONV]], %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.ref, i32) -> !fir.llvm_ptr @@ -222,6 +223,7 @@ func.func @_QPsub9() { // CHECK-LABEL: func.func @_QPsub9() // CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<5xi32> // CHECK: %[[LOCAL:.*]] = fir.declare %[[ALLOCA]] +// CHECK: fir.address_of(@_QMmtestsEn) : !fir.ref> // CHECK: %[[GBL:.*]] = fir.address_of(@_QMmtestsEn) : !fir.ref> // CHECK: %[[GBL_CONV:.*]] = fir.convert %[[GBL]] : (!fir.ref>) -> !fir.llvm_ptr // CHECK: %[[ADDR:.*]] = fir.call @_FortranACUFGetDeviceAddress(%[[GBL_CONV]], %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.ref, i32) -> !fir.llvm_ptr @@ -380,6 +382,7 @@ func.func @_QPdevice_addr_conv() { } // CHECK-LABEL: func.func 
@_QPdevice_addr_conv() +// CHECK: fir.address_of(@_QMmod1Ea_dev) : !fir.ref> // CHECK: %[[GBL:.*]] = fir.address_of(@_QMmod1Ea_dev) : !fir.ref> // CHECK: %[[GBL_CONV:.*]] = fir.convert %[[GBL]] : (!fir.ref>) -> !fir.llvm_ptr // CHECK: %[[ADDR:.*]] = fir.call @_FortranACUFGetDeviceAddress(%[[GBL_CONV]], %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.ref, i32) -> !fir.llvm_ptr diff --git a/flang/test/Fir/CUDA/cuda-global-addr.mlir b/flang/test/Fir/CUDA/cuda-global-addr.mlir index 94ee74736f650..0ccd0c797fb6f 100644 --- a/flang/test/Fir/CUDA/cuda-global-addr.mlir +++ b/flang/test/Fir/CUDA/cuda-global-addr.mlir @@ -26,6 +26,7 @@ func.func @_QQmain() attributes {fir.bindc_name = "test"} { } // CHECK-LABEL: func.func @_QQmain() +// CHECK: fir.address_of(@_QMmod1Eadev) : !fir.ref> // CHECK: %[[ADDR:.*]] = fir.address_of(@_QMmod1Eadev) : !fir.ref> // CHECK: %[[ADDRPTR:.*]] = fir.convert %[[ADDR]] : (!fir.ref>) -> !fir.llvm_ptr // CHECK: %[[DEVICE_ADDR:.*]] = fir.call @_FortranACUFGetDeviceAddress(%[[ADDRPTR]], %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.ref, i32) -> !fir.llvm_ptr diff --git a/flang/test/Fir/CUDA/cuda-launch.fir b/flang/test/Fir/CUDA/cuda-launch.fir index 1e19b3bea1296..8432b9ec926e3 100644 --- a/flang/test/Fir/CUDA/cuda-launch.fir +++ b/flang/test/Fir/CUDA/cuda-launch.fir @@ -98,9 +98,9 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e } // CHECK-LABEL: func.func @_QQmain() +// CHECK: _FortranACUFSyncGlobalDescriptor // CHECK: %[[ADDROF:.*]] = fir.address_of(@_QMdevptrEdev_ptr) : !fir.ref>>> -// CHECK: %[[DECL:.*]] = fir.declare %[[ADDROF]] {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMdevptrEdev_ptr"} : (!fir.ref>>>) -> !fir.ref>>> -// CHECK: %[[CONV_DECL:.*]] = fir.convert %[[DECL]] : (!fir.ref>>>) -> !fir.llvm_ptr -// CHECK: %[[DEVADDR:.*]] = fir.call @_FortranACUFGetDeviceAddress(%[[CONV_DECL]], %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.ref, i32) -> !fir.llvm_ptr +// CHECK: %[[CONV_ADDR:.*]] = fir.convert %[[ADDROF]] : (!fir.ref>>>) -> !fir.llvm_ptr +// CHECK: %[[DEVADDR:.*]] = fir.call @_FortranACUFGetDeviceAddress(%[[CONV_ADDR]], %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.ref, i32) -> !fir.llvm_ptr // CHECK: %[[CONV_DEVADDR:.*]] = fir.convert %[[DEVADDR]] : (!fir.llvm_ptr) -> !fir.ref>>> // CHECK: gpu.launch_func @cuda_device_mod::@_QMdevptrPtest blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %{{.*}} args(%[[CONV_DEVADDR]] : !fir.ref>>>) From 565f3bd641dfdfefd9cf932cf94cc3fbd0b30d33 Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Wed, 15 Jan 2025 09:45:51 +0800 Subject: [PATCH 19/82] [mlir][linalg] Remove redundant checks for variable(NFC) (#122731) This PR removes the redundant checks for the `supported` variable, as it's guaranteed to be true. 
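In condensed form, the surrounding control flow that guarantees this (a
sketch, not the verbatim source):

  if (!supported) {
    return emitSilenceableError()
           << "this operation is not supported to convert to Winograd Conv2D";
  }
  // Execution only reaches this point when `supported` is true, so the
  // `supported &&` conjunct in the condition below was dead:
  if (failed(maybeTransformed))
    return emitSilenceableError() << "apply Winograd Conv2D failed";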
--- mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index 67dd21aafe4fe..8f5b49e0c2130 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -3889,7 +3889,7 @@ DiagnosedSilenceableFailure transform::WinogradConv2DOp::applyToOne( << "this operation is not supported to convert to Winograd Conv2D"; } - if (supported && failed(maybeTransformed)) { + if (failed(maybeTransformed)) { return emitSilenceableError() << "apply Winograd Conv2D failed"; } @@ -3927,7 +3927,7 @@ DiagnosedSilenceableFailure transform::DecomposeWinogradOp::applyToOne( return diag; } - if (supported && failed(maybeTransformed)) { + if (failed(maybeTransformed)) { DiagnosedSilenceableFailure diag = emitSilenceableError() << "decompose Winograd operations failed"; diag.attachNote(target->getLoc()) << "target op"; From ef4800c9168ee45ced8295d13ac68f58b4358759 Mon Sep 17 00:00:00 2001 From: Diego Caballero Date: Tue, 14 Jan 2025 17:58:52 -0800 Subject: [PATCH 20/82] [mlir][Interfaces][NFC] Update doc of ViewLikeOpInterface parser/printer handlers (#122555) This PR addresses part of the feedback provided in #115808. --- .../mlir/Interfaces/ViewLikeInterface.h | 110 ++++++++++++------ mlir/lib/Interfaces/ViewLikeInterface.cpp | 11 +- 2 files changed, 80 insertions(+), 41 deletions(-) diff --git a/mlir/include/mlir/Interfaces/ViewLikeInterface.h b/mlir/include/mlir/Interfaces/ViewLikeInterface.h index eb046bc742298..8f07e43f847ae 100644 --- a/mlir/include/mlir/Interfaces/ViewLikeInterface.h +++ b/mlir/include/mlir/Interfaces/ViewLikeInterface.h @@ -86,62 +86,100 @@ class OpWithOffsetSizesAndStridesConstantArgumentFolder final } }; -/// Printer hook for custom directive in assemblyFormat. +/// Printer hooks for custom directive in assemblyFormat. /// /// custom($values, $integers) /// custom($values, $integers, type($values)) /// -/// where `values` is of ODS type `Variadic<*>` and `integers` is of ODS -/// type `I64ArrayAttr`. Prints a list with either (1) the static integer value -/// in `integers` is `kDynamic` or (2) the next value otherwise. If `valueTypes` -/// is non-empty, it is expected to contain as many elements as `values` -/// indicating their types. This allows idiomatic printing of mixed value and -/// integer attributes in a list. E.g. -/// `[%arg0 : index, 7, 42, %arg42 : i32]`. -/// -/// Indices can be scalable. For example, "4" in "[2, [4], 8]" is scalable. -/// This notation is similar to how scalable dims are marked when defining -/// Vectors. For each value in `integers`, the corresponding `bool` in -/// `scalables` encodes whether it's a scalable index. If `scalableVals` is -/// empty then assume that all indices are non-scalable. +/// where `values` is of ODS type `Variadic<*>` and `integers` is of ODS type +/// `I64ArrayAttr`. Print a list where each element is either: +/// 1. the static integer value in `integers`, if it's not `kDynamic` or, +/// 2. the next value in `values`, otherwise. +/// +/// If `valueTypes` is provided, the corresponding type of each dynamic value is +/// printed. Otherwise, the type is not printed. Each type must match the type +/// of the corresponding value in `values`. `valueTypes` is redundant for +/// printing as we can retrieve the types from the actual `values`. 
However, +/// `valueTypes` is needed for parsing and we must keep the API symmetric for +/// parsing and printing. The type for integer elements is `i64` by default and +/// never printed. +/// +/// Integer indices can also be scalable in the context of scalable vectors, +/// denoted by square brackets (e.g., "[2, [4], 8]"). For each value in +/// `integers`, the corresponding `bool` in `scalableFlags` encodes whether it's +/// a scalable index. If `scalableFlags` is empty then assume that all indices +/// are non-scalable. +/// +/// Examples: +/// +/// * Input: `integers = [kDynamic, 7, 42, kDynamic]`, +/// `values = [%arg0, %arg42]` and +/// `valueTypes = [index, index]` +/// prints: +/// `[%arg0 : index, 7, 42, %arg42 : i32]` +/// +/// * Input: `integers = [kDynamic, 7, 42, kDynamic]`, +/// `values = [%arg0, %arg42]` and +/// `valueTypes = []` +/// prints: +/// `[%arg0, 7, 42, %arg42]` +/// +/// * Input: `integers = [2, 4, 8]`, +/// `values = []` and +/// `scalableFlags = [false, true, false]` +/// prints: +/// `[2, [4], 8]` +/// void printDynamicIndexList( OpAsmPrinter &printer, Operation *op, OperandRange values, - ArrayRef integers, ArrayRef scalables, + ArrayRef integers, ArrayRef scalableFlags, TypeRange valueTypes = TypeRange(), AsmParser::Delimiter delimiter = AsmParser::Delimiter::Square); inline void printDynamicIndexList( OpAsmPrinter &printer, Operation *op, OperandRange values, ArrayRef integers, TypeRange valueTypes = TypeRange(), AsmParser::Delimiter delimiter = AsmParser::Delimiter::Square) { - return printDynamicIndexList(printer, op, values, integers, {}, valueTypes, - delimiter); + return printDynamicIndexList(printer, op, values, integers, + /*scalableFlags=*/{}, valueTypes, delimiter); } -/// Parser hook for custom directive in assemblyFormat. +/// Parser hooks for custom directive in assemblyFormat. /// /// custom($values, $integers) /// custom($values, $integers, type($values)) /// /// where `values` is of ODS type `Variadic<*>` and `integers` is of ODS -/// type `I64ArrayAttr`. Parse a mixed list with either (1) static integer -/// values or (2) SSA values. Fill `integers` with the integer ArrayAttr, where -/// `kDynamic` encodes the position of SSA values. Add the parsed SSA values -/// to `values` in-order. If `valueTypes` is non-null, fill it with types -/// corresponding to values; otherwise the caller must handle the types. -/// -/// E.g. after parsing "[%arg0 : index, 7, 42, %arg42 : i32]": -/// 1. `result` is filled with the i64 ArrayAttr "[`kDynamic`, 7, 42, -/// `kDynamic`]" -/// 2. `ssa` is filled with "[%arg0, %arg1]". -/// -/// Indices can be scalable. For example, "4" in "[2, [4], 8]" is scalable. -/// This notation is similar to how scalable dims are marked when defining -/// Vectors. For each value in `integers`, the corresponding `bool` in -/// `scalableVals` encodes whether it's a scalable index. +/// type `I64ArrayAttr`. Parse a mixed list where each element is either a +/// static integer or an SSA value. Fill `integers` with the integer ArrayAttr, +/// where `kDynamic` encodes the position of SSA values. Add the parsed SSA +/// values to `values` in-order. +/// +/// If `valueTypes` is provided, fill it with the types corresponding to each +/// value in `values`. Otherwise, the caller must handle the types and parsing +/// will fail if the type of the value is found (e.g., `[%arg0 : index, 3, %arg1 +/// : index]`). 
+/// +/// Integer indices can also be scalable in the context of scalable vectors, +/// denoted by square brackets (e.g., "[2, [4], 8]"). For each value in +/// `integers`, the corresponding `bool` in `scalableFlags` encodes whether it's +/// a scalable index. +/// +/// Examples: +/// +/// * After parsing "[%arg0 : index, 7, 42, %arg42 : i32]": +/// 1. `result` is filled with `[kDynamic, 7, 42, kDynamic]` +/// 2. `values` is filled with "[%arg0, %arg1]". +/// 3. `scalableFlags` is filled with `[false, true, false]`. +/// +/// * After parsing `[2, [4], 8]`: +/// 1. `result` is filled with `[2, 4, 8]` +/// 2. `values` is empty. +/// 3. `scalableFlags` is filled with `[false, true, false]`. +/// ParseResult parseDynamicIndexList( OpAsmParser &parser, SmallVectorImpl &values, - DenseI64ArrayAttr &integers, DenseBoolArrayAttr &scalableVals, + DenseI64ArrayAttr &integers, DenseBoolArrayAttr &scalableFlags, SmallVectorImpl *valueTypes = nullptr, AsmParser::Delimiter delimiter = AsmParser::Delimiter::Square); inline ParseResult parseDynamicIndexList( @@ -149,8 +187,8 @@ inline ParseResult parseDynamicIndexList( SmallVectorImpl &values, DenseI64ArrayAttr &integers, SmallVectorImpl *valueTypes = nullptr, AsmParser::Delimiter delimiter = AsmParser::Delimiter::Square) { - DenseBoolArrayAttr scalableVals = {}; - return parseDynamicIndexList(parser, values, integers, scalableVals, + DenseBoolArrayAttr scalableFlags; + return parseDynamicIndexList(parser, values, integers, scalableFlags, valueTypes, delimiter); } diff --git a/mlir/lib/Interfaces/ViewLikeInterface.cpp b/mlir/lib/Interfaces/ViewLikeInterface.cpp index ca33636336bf0..57b5cce7bb13b 100644 --- a/mlir/lib/Interfaces/ViewLikeInterface.cpp +++ b/mlir/lib/Interfaces/ViewLikeInterface.cpp @@ -113,7 +113,8 @@ static char getRightDelimiter(AsmParser::Delimiter delimiter) { void mlir::printDynamicIndexList(OpAsmPrinter &printer, Operation *op, OperandRange values, ArrayRef integers, - ArrayRef scalables, TypeRange valueTypes, + ArrayRef scalableFlags, + TypeRange valueTypes, AsmParser::Delimiter delimiter) { char leftDelimiter = getLeftDelimiter(delimiter); char rightDelimiter = getRightDelimiter(delimiter); @@ -126,7 +127,7 @@ void mlir::printDynamicIndexList(OpAsmPrinter &printer, Operation *op, unsigned dynamicValIdx = 0; unsigned scalableIndexIdx = 0; llvm::interleaveComma(integers, printer, [&](int64_t integer) { - if (!scalables.empty() && scalables[scalableIndexIdx]) + if (!scalableFlags.empty() && scalableFlags[scalableIndexIdx]) printer << "["; if (ShapedType::isDynamic(integer)) { printer << values[dynamicValIdx]; @@ -136,7 +137,7 @@ void mlir::printDynamicIndexList(OpAsmPrinter &printer, Operation *op, } else { printer << integer; } - if (!scalables.empty() && scalables[scalableIndexIdx]) + if (!scalableFlags.empty() && scalableFlags[scalableIndexIdx]) printer << "]"; scalableIndexIdx++; @@ -148,7 +149,7 @@ void mlir::printDynamicIndexList(OpAsmPrinter &printer, Operation *op, ParseResult mlir::parseDynamicIndexList( OpAsmParser &parser, SmallVectorImpl &values, - DenseI64ArrayAttr &integers, DenseBoolArrayAttr &scalables, + DenseI64ArrayAttr &integers, DenseBoolArrayAttr &scalableFlags, SmallVectorImpl *valueTypes, AsmParser::Delimiter delimiter) { SmallVector integerVals; @@ -183,7 +184,7 @@ ParseResult mlir::parseDynamicIndexList( return parser.emitError(parser.getNameLoc()) << "expected SSA value or integer"; integers = parser.getBuilder().getDenseI64ArrayAttr(integerVals); - scalables = 
parser.getBuilder().getDenseBoolArrayAttr(scalableVals); + scalableFlags = parser.getBuilder().getDenseBoolArrayAttr(scalableVals); return success(); } From 0294dab79e24cc4fc41e2d9fc77ad02730e412bc Mon Sep 17 00:00:00 2001 From: LiqinWeng Date: Wed, 15 Jan 2025 10:10:11 +0800 Subject: [PATCH 21/82] [LV][VPlan] Add fast flags for selectRecipe (#121023) Change the inheritance of class VPWidenSelectRecipe to class VPRecipeWithIRFlags, which allows recipe of the select to pass the fastmath flags.The patch of #119847 will add the fastmath flag to for recipe --- llvm/lib/Transforms/Vectorize/VPlan.h | 5 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 3 + .../LoopVectorize/X86/reduction-fastmath.ll | 4 +- .../epilog-vectorization-reductions.ll | 4 +- .../LoopVectorize/select-with-fastflags.ll | 82 +++++++++++++++++++ .../LoopVectorize/vplan-printing.ll | 56 +++++++++++++ 6 files changed, 147 insertions(+), 7 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/select-with-fastflags.ll diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 1da185f9cfdf4..87f87bf143719 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1813,11 +1813,10 @@ class VPHistogramRecipe : public VPRecipeBase { }; /// A recipe for widening select instructions. -struct VPWidenSelectRecipe : public VPSingleDefRecipe { +struct VPWidenSelectRecipe : public VPRecipeWithIRFlags { template VPWidenSelectRecipe(SelectInst &I, iterator_range Operands) - : VPSingleDefRecipe(VPDef::VPWidenSelectSC, Operands, &I, - I.getDebugLoc()) {} + : VPRecipeWithIRFlags(VPDef::VPWidenSelectSC, Operands, I) {} ~VPWidenSelectRecipe() override = default; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 4057a51155ece..979a8e0768a99 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1242,6 +1242,7 @@ void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, O << Indent << "WIDEN-SELECT "; printAsOperand(O, SlotTracker); O << " = select "; + printFlags(O); getOperand(0)->printAsOperand(O, SlotTracker); O << ", "; getOperand(1)->printAsOperand(O, SlotTracker); @@ -1266,6 +1267,8 @@ void VPWidenSelectRecipe::execute(VPTransformState &State) { Value *Op1 = State.get(getOperand(2)); Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); State.set(this, Sel); + if (isa(Sel)) + setFlags(cast(Sel)); State.addMetadata(Sel, dyn_cast_or_null(getUnderlyingValue())); } diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll index 95c74d19dd2db..b4c2a4ae79577 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll @@ -358,8 +358,8 @@ define float @PR35538_more_FMF(ptr nocapture readonly %a, i32 %N) #0 { ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP5]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = fcmp nnan ninf oge <4 x float> [[WIDE_LOAD]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP7:%.*]] = fcmp nnan ninf oge <4 x float> [[WIDE_LOAD2]], [[VEC_PHI1]] -; CHECK-NEXT: [[TMP8]] = select <4 x i1> [[TMP6]], <4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]] -; CHECK-NEXT: [[TMP9]] = select <4 x i1> [[TMP7]], <4 x float> [[WIDE_LOAD2]], <4 x float> [[VEC_PHI1]] +; CHECK-NEXT: [[TMP8]] = select nnan ninf <4 x i1> [[TMP6]], <4 x float> [[WIDE_LOAD]], <4 x float> 
[[VEC_PHI]] +; CHECK-NEXT: [[TMP9]] = select nnan ninf <4 x i1> [[TMP7]], <4 x float> [[WIDE_LOAD2]], <4 x float> [[VEC_PHI1]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll index c6237170eebb1..54489af8c9f12 100644 --- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll @@ -114,7 +114,7 @@ define float @fp_reduction_max(ptr noalias %a, i64 %N) { ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = fcmp fast ogt <4 x float> [[VEC_PHI]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP4]] = select fast <4 x i1> [[TMP3]], <4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] @@ -142,7 +142,7 @@ define float @fp_reduction_max(ptr noalias %a, i64 %N) { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP9]], align 4 ; CHECK-NEXT: [[TMP10:%.*]] = fcmp fast ogt <4 x float> [[VEC_PHI5]], [[WIDE_LOAD6]] -; CHECK-NEXT: [[TMP11]] = select <4 x i1> [[TMP10]], <4 x float> [[VEC_PHI5]], <4 x float> [[WIDE_LOAD6]] +; CHECK-NEXT: [[TMP11]] = select fast <4 x i1> [[TMP10]], <4 x float> [[VEC_PHI5]], <4 x float> [[WIDE_LOAD6]] ; CHECK-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[INDEX4]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/select-with-fastflags.ll b/llvm/test/Transforms/LoopVectorize/select-with-fastflags.ll new file mode 100644 index 0000000000000..3a0bb2ac1d9ee --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/select-with-fastflags.ll @@ -0,0 +1,82 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 + +; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s + +define void @select_with_fastmath_flags(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { +; CHECK-LABEL: define void @select_with_fastmath_flags( +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 
[[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[C]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = fcmp fast ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], splat (float 1.000000e+01) +; CHECK-NEXT: [[TMP7:%.*]] = select fast <4 x i1> [[TMP5]], <4 x float> [[TMP6]], <4 x float> [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i32 0 +; CHECK-NEXT: store <4 x float> [[TMP7]], ptr [[TMP9]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[GEP]], align 4 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds nuw float, ptr [[C]], i64 [[IV]] +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[GEP3]], align 4 +; CHECK-NEXT: [[CMP4:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP11]], 1.000000e+01 +; CHECK-NEXT: [[COND:%.*]] = select fast i1 [[CMP4]], float [[ADD]], float [[TMP12]] +; CHECK-NEXT: [[GEP11:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: store float [[COND]], ptr [[GEP11]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] + %gep = getelementptr inbounds nuw float, ptr %b, i64 %iv + %0 = load float, ptr %gep, align 4 + %gep3 = getelementptr inbounds nuw float, ptr %c, i64 %iv + %1 = load float, ptr %gep3, align 4 + %cmp4 = fcmp fast ogt float %0, %1 + %add = fadd fast float %0, 1.000000e+01 + %cond = select fast i1 %cmp4, float %add, float %1 + %gep11 = getelementptr inbounds nuw float, ptr %a, i64 %iv + store float %cond, ptr %gep11, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = 
!{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index 5c09ce22cc8fb..00d8de67a3b40 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -1200,6 +1200,62 @@ exit: ret i16 %for.1 } +define void @print_select_with_fastmath_flags(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { +; CHECK-LABEL: 'print_select_with_fastmath_flags' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' +; CHECK-NEXT: Live-in vp<[[VFUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<%N> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT_EXIT:%.+]]> +; CHECK-NEXT: vp<[[ST:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; CHECK-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds nuw ir<%b>, vp<[[ST]]> +; CHECK-NEXT: vp<[[PTR1:%.+]]> = vector-pointer ir<[[GEP1]]> +; CHECK-NEXT: WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]> +; CHECK-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds nuw ir<%c>, vp<[[ST]]> +; CHECK-NEXT: vp<[[PTR2:%.+]]> = vector-pointer ir<[[GEP2]]> +; CHECK-NEXT: WIDEN ir<[[LD2:%.+]]> = load vp<[[PTR2]]> +; CHECK-NEXT: WIDEN ir<[[FCMP:%.+]]> = fcmp ogt ir<[[LD1]]>, ir<[[LD2]]> +; CHECK-NEXT: WIDEN ir<[[FADD:%.+]]> = fadd reassoc nnan ninf nsz arcp contract afn ir<[[LD1]]>, ir<1.000000e+01> +; CHECK-NEXT: WIDEN-SELECT ir<[[SELECT:%.+]]> = select reassoc nnan ninf nsz arcp contract afn ir<[[FCMP]]>, ir<[[FADD]]>, ir<[[LD2]]> +; CHECK-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds nuw ir<%a>, vp<[[ST]]> +; CHECK-NEXT: vp<[[PTR3:%.+]]> = vector-pointer ir<[[GEP3]]> +; CHECK-NEXT: WIDEN store vp<[[PTR3]]>, ir<[[SELECT]]> +; CHECK-NEXT: EMIT vp<[[IV_NEXT_EXIT]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } + +entry: + br label %for.body + +for.body: + %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] + %gep = getelementptr inbounds nuw float, ptr %b, i64 %iv + %0 = load float, ptr %gep, align 4 + %gep3 = getelementptr inbounds nuw float, ptr %c, i64 %iv + %1 = load float, ptr %gep3, align 4 + %cmp4 = fcmp fast ogt float %0, %1 + %add = fadd fast float %0, 1.000000e+01 + %cond = select fast i1 %cmp4, float %add, float %1 + %gep11 = getelementptr inbounds nuw float, ptr %a, i64 %iv + store float %cond, ptr %gep11, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4} From 2a5281d0e0000c04606ef86a2cf9c458d9adafef Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Wed, 15 Jan 2025 13:20:19 +1100 Subject: [PATCH 22/82] [ORC-RT] Fix missing '\' line continuations in objc-imageinfo.S test. These missing continuations were causing commands in this testcase to fail. 
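Without a trailing backslash, lit does not join the continuation line into
the preceding command. In the unfixed test, a pair such as

  // RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o %t/swift_4.o 2>&1
  // RUN:   | FileCheck %s -check-prefix=SWIFT_4

was parsed as two separate commands: the second one, a bare "| FileCheck ...",
is a shell syntax error, and the first one's output was never checked.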
--- .../TestCases/Darwin/x86-64/objc-imageinfo.S | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/compiler-rt/test/orc/TestCases/Darwin/x86-64/objc-imageinfo.S b/compiler-rt/test/orc/TestCases/Darwin/x86-64/objc-imageinfo.S index 2d0d8d8c19af4..ae02ada4032fd 100644 --- a/compiler-rt/test/orc/TestCases/Darwin/x86-64/objc-imageinfo.S +++ b/compiler-rt/test/orc/TestCases/Darwin/x86-64/objc-imageinfo.S @@ -20,17 +20,17 @@ // NEW: MachOPlatform: Registered __objc_imageinfo for main // NEW-SAME: flags = 0x0040 // RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o \ -// RUN: %t/swift_4.o 2>&1 +// RUN: %t/swift_4.o 2>&1 \ // RUN: | FileCheck %s -check-prefix=SWIFT_4 // SWIFT_4: MachOPlatform: Registered __objc_imageinfo for main // SWIFT_4-SAME: flags = 0x0640 // RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o \ -// RUN: %t/swift_5.o 2>&1 +// RUN: %t/swift_5.o 2>&1 \ // RUN: | FileCheck %s -check-prefix=SWIFT_5 // SWIFT_5: MachOPlatform: Registered __objc_imageinfo for main // SWIFT_5-SAME: flags = 0x5000740 // RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o \ -// RUN: %t/swift_59.o 2>&1 +// RUN: %t/swift_59.o 2>&1 \ // RUN: | FileCheck %s -check-prefix=SWIFT_59 // SWIFT_59: MachOPlatform: Registered __objc_imageinfo for main // SWIFT_59-SAME: flags = 0x5090740 @@ -50,25 +50,25 @@ // Add swift to objc. // RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o %t/swift_59.o \ -// RUN: %t/objc_new.o 2>&1 +// RUN: %t/objc_new.o 2>&1 \ // RUN: | FileCheck %s -check-prefix=SWIFT_MIX2 // SWIFT_MIX2: MachOPlatform: Merging __objc_imageinfo flags for main {{.*}} -> 0x5090740 // Add multiple swift to objc. // RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o %t/swift_59.o \ -// RUN: %t/swift_5.o %t/objc_new.o 2>&1 +// RUN: %t/swift_5.o %t/objc_new.o 2>&1 \ // RUN: | FileCheck %s -check-prefix=SWIFT_MIX3 // SWIFT_MIX3: MachOPlatform: Merging __objc_imageinfo flags for main {{.*}} -> 0x5000740 // Disable categories. -// RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o %t/objc_old.o -// RUN: %t/objc_new.o 2>&1 +// RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o %t/objc_old.o \ +// RUN: %t/objc_new.o 2>&1 \ // RUN: | FileCheck %s -check-prefix=SWIFT_MIX4 // SWIFT_MIX4: MachOPlatform: Merging __objc_imageinfo flags for main {{.*}} -> 0x0000 // Disable signed class_ro. 
-// RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o %t/objc_new.o -// RUN: %t/objc_new_signed_ro.o 2>&1 +// RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o %t/objc_new.o \ +// RUN: %t/objc_new_signed_ro.o 2>&1 \ // RUN: | FileCheck %s -check-prefix=SWIFT_MIX5 // SWIFT_MIX5: MachOPlatform: Merging __objc_imageinfo flags for main {{.*}} -> 0x0040 From e7f756d4684af4531a6a9564017bcae1226b719f Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Tue, 14 Jan 2025 18:22:54 -0800 Subject: [PATCH 23/82] [NFC][BoundsChecking] Address #122576 review comments (#122773) --- clang/lib/CodeGen/BackendUtil.cpp | 3 ++- llvm/lib/Passes/PassBuilder.cpp | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 79e6bf3d24dff..3951ad01497cc 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -1032,7 +1032,8 @@ void EmitAssemblyHelper::RunOptimizationPipeline( static_assert(SanitizerKind::SO_LocalBounds <= std::numeric_limits< decltype(Options.GuardKind)::value_type>::max(), - "Update type of llvm.allow.ubsan.check."); + "Update type of llvm.allow.ubsan.check to represent " + "SanitizerKind::SO_LocalBounds."); Options.GuardKind = SanitizerKind::SO_LocalBounds; } Options.Merge = diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 94782547325ed..f698a3df08ef7 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -1320,7 +1320,7 @@ parseBoundsCheckingOptions(StringRef Params) { StringRef ParamEQ; StringRef Val; std::tie(ParamEQ, Val) = ParamName.split('='); - int8_t Id = 0; + int8_t Id; if (ParamEQ == "guard" && !Val.getAsInteger(0, Id)) { Options.GuardKind = Id; } else { From 21ade5ae2978b7b809b59b70d63099c87b36dc61 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Wed, 15 Jan 2025 11:12:23 +1100 Subject: [PATCH 24/82] [JITLink] Fix indentation in debugging output. --- llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp index 8b6c88da52eb8..540dfdad5831b 100644 --- a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp @@ -361,14 +361,14 @@ Error EHFrameEdgeFixer::processFDE(ParseContext &PC, Block &B, // Add a keep-alive edge from the FDE target to the FDE to ensure that the // FDE is kept alive if its target is. LLVM_DEBUG({ - dbgs() << " Adding keep-alive edge from target at " + dbgs() << " Adding keep-alive edge from target at " << (*PCBegin)->getBlock().getAddress() << " to FDE at " << RecordAddress << "\n"; }); (*PCBegin)->getBlock().addEdge(Edge::KeepAlive, 0, FDESymbol, 0); } else { LLVM_DEBUG({ - dbgs() << " WARNING: Not adding keep-alive edge to FDE at " + dbgs() << " WARNING: Not adding keep-alive edge to FDE at " << RecordAddress << ", which points to " << ((*PCBegin)->isExternal() ? 
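Besides the reworded assert message, the `= 0` initializer on `Id` is dropped,
presumably because it is dead: `StringRef::getAsInteger` assigns `Id` on
success, and `Id` is only read on that path (a condensed sketch of the parsing
code, not the verbatim source):

  int8_t Id; // written by getAsInteger before any read
  if (ParamEQ == "guard" && !Val.getAsInteger(0, Id))
    Options.GuardKind = Id;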
"external" : "absolute") << " symbol \"" << (*PCBegin)->getName() @@ -395,7 +395,7 @@ Error EHFrameEdgeFixer::processFDE(ParseContext &PC, Block &B, .takeError()) return Err; } else { - LLVM_DEBUG(dbgs() << " Record does not have LSDA field.\n"); + LLVM_DEBUG(dbgs() << " Record does not have LSDA field.\n"); } return Error::success(); From 9c5001e45491ae8b1b2967d2fa48f445799c88ae Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Wed, 15 Jan 2025 12:33:29 +1100 Subject: [PATCH 25/82] [JITLink] Add convenience methods to LinkGraph to find symbols by name. Adds new convenience methods findDefinedSymbolByName, findExternalSymbolByName and findAbsoluteSymbolByName to the LinkGraph class. These should be used to find symbols of the given types by name. COFFLinkGraphBuilder and MachOPlatform are updated to take advantage of the new methods. --- .../llvm/ExecutionEngine/JITLink/JITLink.h | 25 +++++++++++ .../JITLink/COFFLinkGraphBuilder.cpp | 34 ++++---------- .../lib/ExecutionEngine/Orc/MachOPlatform.cpp | 40 +++++++---------- .../JITLink/LinkGraphTests.cpp | 45 +++++++++++++++++++ 4 files changed, 96 insertions(+), 48 deletions(-) diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h index 2af9119670141..67bcb00787312 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h @@ -1387,10 +1387,26 @@ class LinkGraph { GetExternalSymbolMapEntryValue())); } + /// Returns the external symbol with the given name if one exists, otherwise + /// returns nullptr. + Symbol *findExternalSymbolByName(const orc::SymbolStringPtrBase &Name) { + for (auto *Sym : external_symbols()) + if (Sym->getName() == Name) + return Sym; + return nullptr; + } + iterator_range absolute_symbols() { return make_range(AbsoluteSymbols.begin(), AbsoluteSymbols.end()); } + Symbol *findAbsoluteSymbolByName(const orc::SymbolStringPtrBase &Name) { + for (auto *Sym : absolute_symbols()) + if (Sym->getName() == Name) + return Sym; + return nullptr; + } + iterator_range defined_symbols() { auto Secs = sections(); return make_range(defined_symbol_iterator(Secs.begin(), Secs.end()), @@ -1403,6 +1419,15 @@ class LinkGraph { const_defined_symbol_iterator(Secs.end(), Secs.end())); } + /// Returns the defined symbol with the given name if one exists, otherwise + /// returns nullptr. + Symbol *findDefinedSymbolByName(const orc::SymbolStringPtrBase &Name) { + for (auto *Sym : defined_symbols()) + if (Sym->hasName() && Sym->getName() == Name) + return Sym; + return nullptr; + } + /// Make the given symbol external (must not already be external). /// /// Symbol size, linkage and callability will be left unchanged. Symbol scope diff --git a/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp b/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp index d3315aad126cb..e898d336dbe40 100644 --- a/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp @@ -635,32 +635,16 @@ Symbol *GetImageBaseSymbol::operator()(LinkGraph &G) { return *ImageBase; auto IBN = G.intern(ImageBaseName); + ImageBase = G.findExternalSymbolByName(IBN); + if (*ImageBase) + return *ImageBase; + ImageBase = G.findAbsoluteSymbolByName(IBN); + if (*ImageBase) + return *ImageBase; + ImageBase = G.findDefinedSymbolByName(IBN); + if (*ImageBase) + return *ImageBase; - // Check external symbols for image base. 
- for (auto *Sym : G.external_symbols()) { - if (Sym->getName() == IBN) { - ImageBase = Sym; - return Sym; - } - } - - // Check absolute symbols (unexpected, but legal). - for (auto *Sym : G.absolute_symbols()) { - if (Sym->getName() == IBN) { - ImageBase = Sym; - return Sym; - } - } - - // Finally, check defined symbols. - for (auto *Sym : G.defined_symbols()) { - if (Sym->hasName() && Sym->getName() == IBN) { - ImageBase = Sym; - return Sym; - } - } - - ImageBase = nullptr; return nullptr; } diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp index e0d40cf2de5aa..8e66d028f21ce 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp @@ -1492,26 +1492,19 @@ Error MachOPlatform::MachOPlatformPlugin::populateObjCRuntimeObject( memcpy(SD.Sec.sectname, "__objc_imageinfo", 16); strcpy(SD.Sec.segname, "__DATA"); SD.Sec.size = 8; - SD.AddFixups = [&](size_t RecordOffset) { + jitlink::Symbol *ObjCImageInfoSym = nullptr; + SD.AddFixups = [&, ObjCImageInfoSym](size_t RecordOffset) mutable { auto PointerEdge = getPointerEdgeKind(G); // Look for an existing __objc_imageinfo symbol. - jitlink::Symbol *ObjCImageInfoSym = nullptr; - for (auto *Sym : G.external_symbols()) - if (Sym->hasName() && *Sym->getName() == ObjCImageInfoSymbolName) { - ObjCImageInfoSym = Sym; - break; - } - if (!ObjCImageInfoSym) - for (auto *Sym : G.absolute_symbols()) - if (Sym->hasName() && *Sym->getName() == ObjCImageInfoSymbolName) { - ObjCImageInfoSym = Sym; - break; - } - if (!ObjCImageInfoSym) - for (auto *Sym : G.defined_symbols()) - if (Sym->hasName() && *Sym->getName() == ObjCImageInfoSymbolName) { - ObjCImageInfoSym = Sym; + if (!ObjCImageInfoSym) { + auto Name = G.intern(ObjCImageInfoSymbolName); + ObjCImageInfoSym = G.findExternalSymbolByName(Name); + if (!ObjCImageInfoSym) + ObjCImageInfoSym = G.findAbsoluteSymbolByName(Name); + if (!ObjCImageInfoSym) { + ObjCImageInfoSym = G.findDefinedSymbolByName(Name); + if (ObjCImageInfoSym) { std::optional Flags; { std::lock_guard Lock(PluginMutex); @@ -1525,16 +1518,17 @@ Error MachOPlatform::MachOPlatformPlugin::populateObjCRuntimeObject( if (Flags) { // We own the definition of __objc_image_info; write the final // merged flags value. - auto Content = Sym->getBlock().getMutableContent(G); - assert(Content.size() == 8 && + auto Content = ObjCImageInfoSym->getBlock().getMutableContent(G); + assert( + Content.size() == 8 && "__objc_image_info size should have been verified already"); support::endian::write32(&Content[4], *Flags, G.getEndianness()); } - break; } - if (!ObjCImageInfoSym) - ObjCImageInfoSym = - &G.addExternalSymbol(ObjCImageInfoSymbolName, 8, false); + } + if (!ObjCImageInfoSym) + ObjCImageInfoSym = &G.addExternalSymbol(std::move(Name), 8, false); + } SecBlock.addEdge(PointerEdge, RecordOffset + ((char *)&SD.Sec.addr - (char *)&SD.Sec), diff --git a/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp b/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp index 11a379c7e5024..ff6cf49bb9758 100644 --- a/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp +++ b/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp @@ -217,6 +217,51 @@ TEST(LinkGraphTest, ContentAccessAndUpdate) { [](char C) { return C == 0; })); } +TEST(LinkGraphTest, FindSymbolsByName) { + // Check that we can make defined and absolute symbols external. 
+ LinkGraph G("foo", std::make_shared(), + Triple("x86_64-apple-darwin"), SubtargetFeatures(), + getGenericEdgeKindName); + auto &Sec = + G.createSection("__data", orc::MemProt::Read | orc::MemProt::Write); + + auto &B1 = + G.createContentBlock(Sec, BlockContent, orc::ExecutorAddr(0x1000), 8, 0); + + // Add an anonymous symbol to make sure that these don't disrupt by-name + // lookup of defined symbols. + G.addAnonymousSymbol(B1, 0, 0, false, false); + + // Add named defined, external and absolute symbols. + auto Foo = G.intern("foo"); + auto &FooSym = G.addDefinedSymbol(B1, 0, Foo, 4, Linkage::Strong, + Scope::Default, false, false); + + auto Bar = G.intern("bar"); + auto &BarSym = G.addExternalSymbol(Bar, 0, false); + + auto Baz = G.intern("baz"); + auto &BazSym = G.addAbsoluteSymbol(Baz, orc::ExecutorAddr(0x1234), 0, + Linkage::Strong, Scope::Default, true); + + EXPECT_EQ(G.findDefinedSymbolByName(Foo), &FooSym); + EXPECT_EQ(G.findExternalSymbolByName(Foo), nullptr); + EXPECT_EQ(G.findAbsoluteSymbolByName(Foo), nullptr); + + EXPECT_EQ(G.findDefinedSymbolByName(Bar), nullptr); + EXPECT_EQ(G.findExternalSymbolByName(Bar), &BarSym); + EXPECT_EQ(G.findAbsoluteSymbolByName(Bar), nullptr); + + EXPECT_EQ(G.findDefinedSymbolByName(Baz), nullptr); + EXPECT_EQ(G.findExternalSymbolByName(Baz), nullptr); + EXPECT_EQ(G.findAbsoluteSymbolByName(Baz), &BazSym); + + auto Qux = G.intern("qux"); + EXPECT_EQ(G.findDefinedSymbolByName(Qux), nullptr); + EXPECT_EQ(G.findExternalSymbolByName(Qux), nullptr); + EXPECT_EQ(G.findAbsoluteSymbolByName(Qux), nullptr); +} + TEST(LinkGraphTest, MakeExternal) { // Check that we can make defined and absolute symbols external. LinkGraph G("foo", std::make_shared(), From 9f48bb637eeb40e40bb9e2927ea97b58684ece3a Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Wed, 15 Jan 2025 14:20:07 +1100 Subject: [PATCH 26/82] [JITLink] Document EHFrameEdgeFixer's handling of implicit relocations. NFC. On platfarms where some relocations for eh-frame sections are implicit (e.g. MachO/x86-64) EHFrameEdgeFixer is responsible for adding edges for the implicit relocations. --- llvm/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h index 49fbf650e7a77..841ec9c055a2e 100644 --- a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h +++ b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h @@ -26,6 +26,9 @@ namespace jitlink { class EHFrameEdgeFixer { public: /// Create an eh-frame edge fixer. + /// Adds edges for implicit relocations on platforms where these are used + /// (e.g. MachO/x86-64). + /// /// If a given edge-kind is not supported on the target architecture then /// Edge::Invalid should be used. EHFrameEdgeFixer(StringRef EHFrameSectionName, unsigned PointerSize, From da4ac13acac166738f769a9dfa2be40563d6ff64 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 14 Jan 2025 20:04:27 -0800 Subject: [PATCH 27/82] [RISCV][llvm-exegesis] Simplify copying a SmallVector to a std::vector. 
NFC (#122988) --- llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp b/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp index 5636782bdf7f6..d55db9af8a9bd 100644 --- a/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp @@ -32,13 +32,11 @@ namespace { static std::vector loadIntReg(const MCSubtargetInfo &STI, unsigned Reg, const APInt &Value) { SmallVector MCInstSeq; - std::vector MatIntInstrs; MCRegister DestReg = Reg; RISCVMatInt::generateMCInstSeq(Value.getSExtValue(), STI, DestReg, MCInstSeq); - MatIntInstrs.resize(MCInstSeq.size()); - std::copy(MCInstSeq.begin(), MCInstSeq.end(), MatIntInstrs.begin()); + std::vector MatIntInstrs(MCInstSeq.begin(), MCInstSeq.end()); return MatIntInstrs; } From 9ac6a55ec54fe4cd4a99c69ef1a4ddaea49e6688 Mon Sep 17 00:00:00 2001 From: Nathan Ridge Date: Tue, 14 Jan 2025 23:06:51 -0500 Subject: [PATCH 28/82] [clang][AST] Assert that DependentNameType's Name and NNS are not null (#122418) Also clarify the comment above DependentNameType::getIdentifier() --- clang/include/clang/AST/Type.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 78677df578c4b..f0fbacccc97bb 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -7040,17 +7040,17 @@ class DependentNameType : public TypeWithKeyword, public llvm::FoldingSetNode { : TypeWithKeyword(Keyword, DependentName, CanonType, TypeDependence::DependentInstantiation | toTypeDependence(NNS->getDependence())), - NNS(NNS), Name(Name) {} + NNS(NNS), Name(Name) { + assert(NNS); + assert(Name); + } public: /// Retrieve the qualification on this type. NestedNameSpecifier *getQualifier() const { return NNS; } - /// Retrieve the type named by the typename specifier as an identifier. - /// - /// This routine will return a non-NULL identifier pointer when the - /// form of the original typename was terminated by an identifier, - /// e.g., "typename T::type". + /// Retrieve the identifier that terminates this type name. + /// For example, "type" in "typename T::type". const IdentifierInfo *getIdentifier() const { return Name; } From c4fb7180cbbe977f1ab1ce945a691550f8fdd1fb Mon Sep 17 00:00:00 2001 From: Greg Clayton Date: Tue, 14 Jan 2025 20:12:46 -0800 Subject: [PATCH 29/82] [lldb][NFC] Make the target's SectionLoadList private. (#113278) Lots of code around LLDB was directly accessing the target's section load list. This NFC patch makes the section load list private so the Target class can access it, but everyone else now uses accessor functions. This allows us to control the resolving of addresses and will allow for functionality in LLDB which can lazily resolve addresses in JIT plug-ins with a future patch. 
---
 lldb/include/lldb/Target/SectionLoadHistory.h | 2 +-
 lldb/include/lldb/Target/Target.h | 17 ++++++++++---
 lldb/source/API/SBBreakpoint.cpp | 4 +--
 .../Breakpoint/BreakpointLocationList.cpp | 3 +--
 .../Commands/CommandObjectDisassemble.cpp | 6 ++---
 .../source/Commands/CommandObjectRegister.cpp | 4 +--
 lldb/source/Commands/CommandObjectSource.cpp | 9 +++----
 lldb/source/Commands/CommandObjectTarget.cpp | 15 ++++++-----
 lldb/source/Core/Address.cpp | 10 ++++----
 lldb/source/Core/Disassembler.cpp | 10 ++++----
 lldb/source/Core/DumpDataExtractor.cpp | 10 +++-----
 lldb/source/Core/FormatEntity.cpp | 2 +-
 lldb/source/Core/Section.cpp | 5 ++--
 lldb/source/Core/Value.cpp | 5 ++--
 .../DataFormatters/CXXFunctionPointer.cpp | 5 ++--
 lldb/source/Expression/ObjectFileJIT.cpp | 4 +--
 .../Architecture/Mips/ArchitectureMips.cpp | 3 +--
 .../Disassembler/LLVMC/DisassemblerLLVMC.cpp | 6 ++---
 .../MacOSX-DYLD/DynamicLoaderMacOS.cpp | 2 +-
 .../Static/DynamicLoaderStatic.cpp | 4 +--
 .../TSan/InstrumentationRuntimeTSan.cpp | 9 +++----
 .../Plugins/JITLoader/GDB/JITLoaderGDB.cpp | 2 +-
 .../CPlusPlus/CPPLanguageRuntime.cpp | 18 ++++++-------
 .../Plugins/ObjectFile/ELF/ObjectFileELF.cpp | 3 +--
 .../ObjectFile/Mach-O/ObjectFileMachO.cpp | 10 ++++----
 .../ObjectFile/PECOFF/ObjectFilePECOFF.cpp | 2 +-
 .../Placeholder/ObjectFilePlaceholder.cpp | 3 +--
 .../Process/minidump/ProcessMinidump.cpp | 4 +--
 .../intel-pt/TraceIntelPTBundleSaver.cpp | 3 +--
 lldb/source/Symbol/ObjectFile.cpp | 3 +--
 lldb/source/Target/ProcessTrace.cpp | 2 +-
 lldb/source/Target/SectionLoadHistory.cpp | 6 +++--
 lldb/source/Target/Target.cpp | 25 ++++++++++++++++---
 lldb/source/Target/ThreadPlanStepInRange.cpp | 3 +--
 lldb/source/Target/ThreadPlanTracer.cpp | 3 +--
 35 files changed, 116 insertions(+), 106 deletions(-)

diff --git a/lldb/include/lldb/Target/SectionLoadHistory.h b/lldb/include/lldb/Target/SectionLoadHistory.h
index 64bb828d4254a..4380d6f2cf121 100644
--- a/lldb/include/lldb/Target/SectionLoadHistory.h
+++ b/lldb/include/lldb/Target/SectionLoadHistory.h
@@ -45,7 +45,7 @@ class SectionLoadHistory {
                                  const lldb::SectionSP &section_sp);

   bool ResolveLoadAddress(uint32_t stop_id, lldb::addr_t load_addr,
-                          Address &so_addr);
+                          Address &so_addr, bool allow_section_end = false);

   bool SetSectionLoadAddress(uint32_t stop_id,
                              const lldb::SectionSP &section_sp,
diff --git a/lldb/include/lldb/Target/Target.h b/lldb/include/lldb/Target/Target.h
index 0d1943450d622..f31ac381391b4 100644
--- a/lldb/include/lldb/Target/Target.h
+++ b/lldb/include/lldb/Target/Target.h
@@ -1151,9 +1151,13 @@ class Target : public std::enable_shared_from_this<Target>,
                              Address &pointer_addr,
                              bool force_live_memory = false);

-  SectionLoadList &GetSectionLoadList() {
-    return m_section_load_history.GetCurrentSectionLoadList();
-  }
+  bool HasLoadedSections();
+
+  lldb::addr_t GetSectionLoadAddress(const lldb::SectionSP &section_sp);
+
+  void ClearSectionLoadList();
+
+  void DumpSectionLoadList(Stream &s);

   static Target *GetTargetFromContexts(const ExecutionContext *exe_ctx_ptr,
                                        const SymbolContext *sc_ptr);
@@ -1218,7 +1222,8 @@ class Target : public std::enable_shared_from_this<Target>,
   bool ResolveFileAddress(lldb::addr_t load_addr, Address &so_addr);

   bool ResolveLoadAddress(lldb::addr_t load_addr, Address &so_addr,
-                          uint32_t stop_id = SectionLoadHistory::eStopIDNow);
+                          uint32_t stop_id = SectionLoadHistory::eStopIDNow,
+                          bool allow_section_end = false);

   bool SetSectionLoadAddress(const lldb::SectionSP &section,
                              lldb::addr_t load_addr,
@@ -1666,6 +1671,10 @@ class Target : public
std::enable_shared_from_this<Target>,
   Target(const Target &) = delete;
   const Target &operator=(const Target &) = delete;
+
+  SectionLoadList &GetSectionLoadList() {
+    return m_section_load_history.GetCurrentSectionLoadList();
+  }
 };

 } // namespace lldb_private
diff --git a/lldb/source/API/SBBreakpoint.cpp b/lldb/source/API/SBBreakpoint.cpp
index b2ed034d19983..87fadbcec4f26 100644
--- a/lldb/source/API/SBBreakpoint.cpp
+++ b/lldb/source/API/SBBreakpoint.cpp
@@ -137,7 +137,7 @@ SBBreakpointLocation SBBreakpoint::FindLocationByAddress(addr_t vm_addr) {
         bkpt_sp->GetTarget().GetAPIMutex());
     Address address;
     Target &target = bkpt_sp->GetTarget();
-    if (!target.GetSectionLoadList().ResolveLoadAddress(vm_addr, address)) {
+    if (!target.ResolveLoadAddress(vm_addr, address)) {
       address.SetRawAddress(vm_addr);
     }
     sb_bp_location.SetLocation(bkpt_sp->FindLocationByAddress(address));
@@ -157,7 +157,7 @@ break_id_t SBBreakpoint::FindLocationIDByAddress(addr_t vm_addr) {
         bkpt_sp->GetTarget().GetAPIMutex());
     Address address;
     Target &target = bkpt_sp->GetTarget();
-    if (!target.GetSectionLoadList().ResolveLoadAddress(vm_addr, address)) {
+    if (!target.ResolveLoadAddress(vm_addr, address)) {
       address.SetRawAddress(vm_addr);
     }
     break_id = bkpt_sp->FindLocationIDByAddress(address);
diff --git a/lldb/source/Breakpoint/BreakpointLocationList.cpp b/lldb/source/Breakpoint/BreakpointLocationList.cpp
index e0f1b9b2c8088..0240305d6f292 100644
--- a/lldb/source/Breakpoint/BreakpointLocationList.cpp
+++ b/lldb/source/Breakpoint/BreakpointLocationList.cpp
@@ -103,8 +103,7 @@ BreakpointLocationList::FindByAddress(const Address &addr) const {
       so_addr = addr;
     } else {
       // Try and resolve as a load address if possible.
-      m_owner.GetTarget().GetSectionLoadList().ResolveLoadAddress(
-          addr.GetOffset(), so_addr);
+      m_owner.GetTarget().ResolveLoadAddress(addr.GetOffset(), so_addr);
       if (!so_addr.IsValid()) {
         // The address didn't resolve, so just set to passed in addr.
so_addr = addr; diff --git a/lldb/source/Commands/CommandObjectDisassemble.cpp b/lldb/source/Commands/CommandObjectDisassemble.cpp index 6db4b2665bd84..5b131fe86dedb 100644 --- a/lldb/source/Commands/CommandObjectDisassemble.cpp +++ b/lldb/source/Commands/CommandObjectDisassemble.cpp @@ -269,10 +269,10 @@ CommandObjectDisassemble::GetContainingAddressRanges() { }; Target &target = GetTarget(); - if (!target.GetSectionLoadList().IsEmpty()) { + if (target.HasLoadedSections()) { Address symbol_containing_address; - if (target.GetSectionLoadList().ResolveLoadAddress( - m_options.symbol_containing_addr, symbol_containing_address)) { + if (target.ResolveLoadAddress(m_options.symbol_containing_addr, + symbol_containing_address)) { get_range(symbol_containing_address); } } else { diff --git a/lldb/source/Commands/CommandObjectRegister.cpp b/lldb/source/Commands/CommandObjectRegister.cpp index 4e047ccbc10b9..fbb92e5c63877 100644 --- a/lldb/source/Commands/CommandObjectRegister.cpp +++ b/lldb/source/Commands/CommandObjectRegister.cpp @@ -95,8 +95,8 @@ class CommandObjectRegisterRead : public CommandObjectParsed { addr_t reg_addr = reg_value.GetAsUInt64(LLDB_INVALID_ADDRESS); if (reg_addr != LLDB_INVALID_ADDRESS) { Address so_reg_addr; - if (exe_ctx.GetTargetRef().GetSectionLoadList().ResolveLoadAddress( - reg_addr, so_reg_addr)) { + if (exe_ctx.GetTargetRef().ResolveLoadAddress(reg_addr, + so_reg_addr)) { strm.PutCString(" "); so_reg_addr.Dump(&strm, exe_ctx.GetBestExecutionContextScope(), Address::DumpStyleResolvedDescription); diff --git a/lldb/source/Commands/CommandObjectSource.cpp b/lldb/source/Commands/CommandObjectSource.cpp index c8295fd10cf22..936783216f6ff 100644 --- a/lldb/source/Commands/CommandObjectSource.cpp +++ b/lldb/source/Commands/CommandObjectSource.cpp @@ -302,7 +302,7 @@ class CommandObjectSourceInfo : public CommandObjectParsed { size_t num_matches = 0; assert(module_list.GetSize() > 0); Target &target = GetTarget(); - if (target.GetSectionLoadList().IsEmpty()) { + if (!target.HasLoadedSections()) { // The target isn't loaded yet, we need to lookup the file address in all // modules. Note: the module list option does not apply to addresses. const size_t num_modules = module_list.GetSize(); @@ -328,7 +328,7 @@ class CommandObjectSourceInfo : public CommandObjectParsed { } else { // The target has some things loaded, resolve this address to a compile // unit + file + line and display - if (target.GetSectionLoadList().ResolveLoadAddress(addr, so_addr)) { + if (target.ResolveLoadAddress(addr, so_addr)) { ModuleSP module_sp(so_addr.GetModule()); // Check to make sure this module is in our list. 
if (module_sp && module_list.GetIndexForModule(module_sp.get()) != @@ -959,7 +959,7 @@ class CommandObjectSourceList : public CommandObjectParsed { StreamString error_strm; SymbolContextList sc_list; - if (target.GetSectionLoadList().IsEmpty()) { + if (!target.HasLoadedSections()) { // The target isn't loaded yet, we need to lookup the file address in // all modules const ModuleList &module_list = target.GetImages(); @@ -987,8 +987,7 @@ class CommandObjectSourceList : public CommandObjectParsed { } else { // The target has some things loaded, resolve this address to a compile // unit + file + line and display - if (target.GetSectionLoadList().ResolveLoadAddress(m_options.address, - so_addr)) { + if (target.ResolveLoadAddress(m_options.address, so_addr)) { ModuleSP module_sp(so_addr.GetModule()); if (module_sp) { SymbolContext sc; diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp index 307f4f683e3b2..d8265e41a7384 100644 --- a/lldb/source/Commands/CommandObjectTarget.cpp +++ b/lldb/source/Commands/CommandObjectTarget.cpp @@ -1522,8 +1522,8 @@ static bool LookupAddressInModule(CommandInterpreter &interpreter, Stream &strm, Address so_addr; SymbolContext sc; Target *target = interpreter.GetExecutionContext().GetTargetPtr(); - if (target && !target->GetSectionLoadList().IsEmpty()) { - if (!target->GetSectionLoadList().ResolveLoadAddress(addr, so_addr)) + if (target && target->HasLoadedSections()) { + if (!target->ResolveLoadAddress(addr, so_addr)) return false; else if (so_addr.GetModule().get() != module) return false; @@ -2974,8 +2974,8 @@ class CommandObjectTargetModulesLoad sect_name); break; } else { - if (target.GetSectionLoadList().SetSectionLoadAddress( - section_sp, load_addr)) + if (target.SetSectionLoadAddress(section_sp, + load_addr)) changed = true; result.AppendMessageWithFormat( "section '%s' loaded at 0x%" PRIx64 "\n", @@ -3329,7 +3329,7 @@ class CommandObjectTargetModulesList : public CommandObjectParsed { if (objfile) { Address base_addr(objfile->GetBaseAddress()); if (base_addr.IsValid()) { - if (!target.GetSectionLoadList().IsEmpty()) { + if (target.HasLoadedSections()) { lldb::addr_t load_addr = base_addr.GetLoadAddress(&target); if (load_addr == LLDB_INVALID_ADDRESS) { base_addr.Dump(&strm, &target, @@ -3544,8 +3544,7 @@ class CommandObjectTargetModulesShowUnwind : public CommandObjectParsed { function_options, sc_list); } else if (m_options.m_type == eLookupTypeAddress && target) { Address addr; - if (target->GetSectionLoadList().ResolveLoadAddress(m_options.m_addr, - addr)) { + if (target->ResolveLoadAddress(m_options.m_addr, addr)) { SymbolContext sc; ModuleSP module_sp(addr.GetModule()); module_sp->ResolveSymbolContextForAddress(addr, @@ -5270,7 +5269,7 @@ class CommandObjectTargetDumpSectionLoadList : public CommandObjectParsed { protected: void DoExecute(Args &command, CommandReturnObject &result) override { Target &target = GetTarget(); - target.GetSectionLoadList().Dump(result.GetOutputStream(), &target); + target.DumpSectionLoadList(result.GetOutputStream()); result.SetStatus(eReturnStatusSuccessFinishResult); } }; diff --git a/lldb/source/Core/Address.cpp b/lldb/source/Core/Address.cpp index 5a4751bd5256e..1dab874a96583 100644 --- a/lldb/source/Core/Address.cpp +++ b/lldb/source/Core/Address.cpp @@ -138,9 +138,8 @@ static bool ReadAddress(ExecutionContextScope *exe_scope, // If we have any sections that are loaded, try and resolve using the // section load list Target *target = exe_ctx.GetTargetPtr(); - if 
(target && !target->GetSectionLoadList().IsEmpty()) { - if (target->GetSectionLoadList().ResolveLoadAddress(deref_addr, - deref_so_addr)) + if (target && target->HasLoadedSections()) { + if (target->ResolveLoadAddress(deref_addr, deref_so_addr)) return true; } else { // If we were not running, yet able to read an integer, we must have a @@ -1046,8 +1045,9 @@ AddressClass Address::GetAddressClass() const { bool Address::SetLoadAddress(lldb::addr_t load_addr, Target *target, bool allow_section_end) { - if (target && target->GetSectionLoadList().ResolveLoadAddress( - load_addr, *this, allow_section_end)) + if (target && target->ResolveLoadAddress(load_addr, *this, + SectionLoadHistory::eStopIDNow, + allow_section_end)) return true; m_section_wp.reset(); m_offset = load_addr; diff --git a/lldb/source/Core/Disassembler.cpp b/lldb/source/Core/Disassembler.cpp index 68e52144eb6ef..b3e7c4c13061d 100644 --- a/lldb/source/Core/Disassembler.cpp +++ b/lldb/source/Core/Disassembler.cpp @@ -107,11 +107,11 @@ static Address ResolveAddress(Target &target, const Address &addr) { Address resolved_addr; // If we weren't passed in a section offset address range, try and resolve // it to something - bool is_resolved = target.GetSectionLoadList().IsEmpty() - ? target.GetImages().ResolveFileAddress( - addr.GetOffset(), resolved_addr) - : target.GetSectionLoadList().ResolveLoadAddress( - addr.GetOffset(), resolved_addr); + bool is_resolved = + target.HasLoadedSections() + ? target.ResolveLoadAddress(addr.GetOffset(), resolved_addr) + : target.GetImages().ResolveFileAddress(addr.GetOffset(), + resolved_addr); // We weren't able to resolve the address, just treat it as a raw address if (is_resolved && resolved_addr.IsValid()) diff --git a/lldb/source/Core/DumpDataExtractor.cpp b/lldb/source/Core/DumpDataExtractor.cpp index 565ee3a0ae40a..72140736d8877 100644 --- a/lldb/source/Core/DumpDataExtractor.cpp +++ b/lldb/source/Core/DumpDataExtractor.cpp @@ -136,10 +136,10 @@ static lldb::offset_t DumpInstructions(const DataExtractor &DE, Stream *s, lldb::addr_t addr = base_addr + start_offset; lldb_private::Address so_addr; bool data_from_file = true; - if (target_sp->GetSectionLoadList().ResolveLoadAddress(addr, so_addr)) { + if (target_sp->ResolveLoadAddress(addr, so_addr)) { data_from_file = false; } else { - if (target_sp->GetSectionLoadList().IsEmpty() || + if (!target_sp->HasLoadedSections() || !target_sp->GetImages().ResolveFileAddress(addr, so_addr)) so_addr.SetRawAddress(addr); } @@ -707,8 +707,7 @@ lldb::offset_t lldb_private::DumpDataExtractor( TargetSP target_sp(exe_scope->CalculateTarget()); lldb_private::Address so_addr; if (target_sp) { - if (target_sp->GetSectionLoadList().ResolveLoadAddress(addr, - so_addr)) { + if (target_sp->ResolveLoadAddress(addr, so_addr)) { s->PutChar(' '); so_addr.Dump(s, exe_scope, Address::DumpStyleResolvedDescription, Address::DumpStyleModuleWithFileAddress); @@ -719,8 +718,7 @@ lldb::offset_t lldb_private::DumpDataExtractor( if (ProcessSP process_sp = exe_scope->CalculateProcess()) { if (ABISP abi_sp = process_sp->GetABI()) { addr_t addr_fixed = abi_sp->FixCodeAddress(addr); - if (target_sp->GetSectionLoadList().ResolveLoadAddress( - addr_fixed, so_addr)) { + if (target_sp->ResolveLoadAddress(addr_fixed, so_addr)) { s->PutChar(' '); s->Printf("(0x%*.*" PRIx64 ")", (int)(2 * item_byte_size), (int)(2 * item_byte_size), addr_fixed); diff --git a/lldb/source/Core/FormatEntity.cpp b/lldb/source/Core/FormatEntity.cpp index d76fc97caa013..e13284832cf57 100644 --- 
a/lldb/source/Core/FormatEntity.cpp
+++ b/lldb/source/Core/FormatEntity.cpp
@@ -412,7 +412,7 @@ static bool DumpAddressAndContent(Stream &s, const SymbolContext *sc,
   Target *target = Target::GetTargetFromContexts(exe_ctx, sc);

   addr_t vaddr = LLDB_INVALID_ADDRESS;
-  if (target && !target->GetSectionLoadList().IsEmpty())
+  if (target && target->HasLoadedSections())
     vaddr = addr.GetLoadAddress(target);
   if (vaddr == LLDB_INVALID_ADDRESS)
     vaddr = addr.GetFileAddress();
diff --git a/lldb/source/Core/Section.cpp b/lldb/source/Core/Section.cpp
index 31273ede618f2..a17f43fe89033 100644
--- a/lldb/source/Core/Section.cpp
+++ b/lldb/source/Core/Section.cpp
@@ -238,7 +238,7 @@ addr_t Section::GetLoadBaseAddress(Target *target) const {
       load_base_addr += GetOffset();
   }
   if (load_base_addr == LLDB_INVALID_ADDRESS) {
-    load_base_addr = target->GetSectionLoadList().GetSectionLoadAddress(
+    load_base_addr = target->GetSectionLoadAddress(
        const_cast<Section *>
(this)->shared_from_this()); } return load_base_addr; @@ -643,8 +643,7 @@ bool SectionList::ContainsSection(user_id_t sect_id) const { void SectionList::Dump(llvm::raw_ostream &s, unsigned indent, Target *target, bool show_header, uint32_t depth) const { - bool target_has_loaded_sections = - target && !target->GetSectionLoadList().IsEmpty(); + bool target_has_loaded_sections = target && target->HasLoadedSections(); if (show_header && !m_sections.empty()) { s.indent(indent); s << llvm::formatv( diff --git a/lldb/source/Core/Value.cpp b/lldb/source/Core/Value.cpp index bd93c04c16d24..70299cb8455a1 100644 --- a/lldb/source/Core/Value.cpp +++ b/lldb/source/Core/Value.cpp @@ -364,10 +364,9 @@ Status Value::GetValueAsData(ExecutionContext *exe_ctx, DataExtractor &data, // memory sections loaded. This allows you to use "target modules // load" to load your executable and any shared libraries, then // execute commands where you can look at types in data sections. - const SectionLoadList &target_sections = target->GetSectionLoadList(); - if (!target_sections.IsEmpty()) { + if (target->HasLoadedSections()) { address = m_value.ULongLong(LLDB_INVALID_ADDRESS); - if (target_sections.ResolveLoadAddress(address, file_so_addr)) { + if (target->ResolveLoadAddress(address, file_so_addr)) { address_type = eAddressTypeLoad; data.SetByteOrder(target->GetArchitecture().GetByteOrder()); data.SetAddressByteSize( diff --git a/lldb/source/DataFormatters/CXXFunctionPointer.cpp b/lldb/source/DataFormatters/CXXFunctionPointer.cpp index 6d56e39fa9733..e17659ad0f2f0 100644 --- a/lldb/source/DataFormatters/CXXFunctionPointer.cpp +++ b/lldb/source/DataFormatters/CXXFunctionPointer.cpp @@ -39,9 +39,8 @@ bool lldb_private::formatters::CXXFunctionPointerSummaryProvider( Address so_addr; Target *target = exe_ctx.GetTargetPtr(); - if (target && !target->GetSectionLoadList().IsEmpty()) { - target->GetSectionLoadList().ResolveLoadAddress(func_ptr_address, - so_addr); + if (target && target->HasLoadedSections()) { + target->ResolveLoadAddress(func_ptr_address, so_addr); if (so_addr.GetSection() == nullptr) { // If we have an address that doesn't correspond to any symbol, // it might have authentication bits. 
Strip them & see if it diff --git a/lldb/source/Expression/ObjectFileJIT.cpp b/lldb/source/Expression/ObjectFileJIT.cpp index 9a839866096bd..e4a613551d22e 100644 --- a/lldb/source/Expression/ObjectFileJIT.cpp +++ b/lldb/source/Expression/ObjectFileJIT.cpp @@ -178,8 +178,8 @@ bool ObjectFileJIT::SetLoadAddress(Target &target, lldb::addr_t value, SectionSP section_sp(section_list->GetSectionAtIndex(sect_idx)); if (section_sp && section_sp->GetFileSize() > 0 && !section_sp->IsThreadSpecific()) { - if (target.GetSectionLoadList().SetSectionLoadAddress( - section_sp, section_sp->GetFileAddress() + value)) + if (target.SetSectionLoadAddress(section_sp, + section_sp->GetFileAddress() + value)) ++num_loaded_sections; } } diff --git a/lldb/source/Plugins/Architecture/Mips/ArchitectureMips.cpp b/lldb/source/Plugins/Architecture/Mips/ArchitectureMips.cpp index 5aa903443c760..3748be0533ad7 100644 --- a/lldb/source/Plugins/Architecture/Mips/ArchitectureMips.cpp +++ b/lldb/source/Plugins/Architecture/Mips/ArchitectureMips.cpp @@ -76,8 +76,7 @@ lldb::addr_t ArchitectureMips::GetBreakableLoadAddress(lldb::addr_t addr, Address resolved_addr; - SectionLoadList §ion_load_list = target.GetSectionLoadList(); - if (section_load_list.IsEmpty()) + if (!target.HasLoadedSections()) // No sections are loaded, so we must assume we are not running yet and // need to operate only on file address. target.ResolveFileAddress(addr, resolved_addr); diff --git a/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp b/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp index 81c122146764d..76f2db086476f 100644 --- a/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp +++ b/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp @@ -1787,9 +1787,9 @@ const char *DisassemblerLLVMC::SymbolLookup(uint64_t value, uint64_t *type_ptr, module_sp->ResolveFileAddress(value, value_so_addr); module_sp->ResolveFileAddress(pc, pc_so_addr); } - } else if (target && !target->GetSectionLoadList().IsEmpty()) { - target->GetSectionLoadList().ResolveLoadAddress(value, value_so_addr); - target->GetSectionLoadList().ResolveLoadAddress(pc, pc_so_addr); + } else if (target && target->HasLoadedSections()) { + target->ResolveLoadAddress(value, value_so_addr); + target->ResolveLoadAddress(pc, pc_so_addr); } SymbolContext sym_ctx; diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOS.cpp b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOS.cpp index 82555d1e028b4..5b11059bcc50c 100644 --- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOS.cpp +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOS.cpp @@ -368,7 +368,7 @@ bool DynamicLoaderMacOS::NotifyBreakpointHit(void *baton, dyld_instance->UnloadAllImages(); dyld_instance->ClearDYLDModule(); process->GetTarget().GetImages().Clear(); - process->GetTarget().GetSectionLoadList().Clear(); + process->GetTarget().ClearSectionLoadList(); addr_t all_image_infos = process->GetImageInfoAddress(); int addr_size = diff --git a/lldb/source/Plugins/DynamicLoader/Static/DynamicLoaderStatic.cpp b/lldb/source/Plugins/DynamicLoader/Static/DynamicLoaderStatic.cpp index e8b92373ef0fa..643c9653f26ec 100644 --- a/lldb/source/Plugins/DynamicLoader/Static/DynamicLoaderStatic.cpp +++ b/lldb/source/Plugins/DynamicLoader/Static/DynamicLoaderStatic.cpp @@ -103,8 +103,8 @@ void DynamicLoaderStatic::LoadAllImagesAtFileAddresses() { for (size_t sect_idx = 0; sect_idx < num_sections; ++sect_idx) { SectionSP 
section_sp(section_list->GetSectionAtIndex(sect_idx)); if (section_sp) { - if (target.GetSectionLoadList().GetSectionLoadAddress( - section_sp) != LLDB_INVALID_ADDRESS) { + if (target.GetSectionLoadAddress(section_sp) != + LLDB_INVALID_ADDRESS) { no_load_addresses = false; break; } diff --git a/lldb/source/Plugins/InstrumentationRuntime/TSan/InstrumentationRuntimeTSan.cpp b/lldb/source/Plugins/InstrumentationRuntime/TSan/InstrumentationRuntimeTSan.cpp index 6d3e5b7e5573c..70e36801c3fd7 100644 --- a/lldb/source/Plugins/InstrumentationRuntime/TSan/InstrumentationRuntimeTSan.cpp +++ b/lldb/source/Plugins/InstrumentationRuntime/TSan/InstrumentationRuntimeTSan.cpp @@ -546,8 +546,7 @@ static std::string Sprintf(const char *format, ...) { static std::string GetSymbolNameFromAddress(ProcessSP process_sp, addr_t addr) { lldb_private::Address so_addr; - if (!process_sp->GetTarget().GetSectionLoadList().ResolveLoadAddress(addr, - so_addr)) + if (!process_sp->GetTarget().ResolveLoadAddress(addr, so_addr)) return ""; lldb_private::Symbol *symbol = so_addr.CalculateSymbolContextSymbol(); @@ -561,8 +560,7 @@ static std::string GetSymbolNameFromAddress(ProcessSP process_sp, addr_t addr) { static void GetSymbolDeclarationFromAddress(ProcessSP process_sp, addr_t addr, Declaration &decl) { lldb_private::Address so_addr; - if (!process_sp->GetTarget().GetSectionLoadList().ResolveLoadAddress(addr, - so_addr)) + if (!process_sp->GetTarget().ResolveLoadAddress(addr, so_addr)) return; lldb_private::Symbol *symbol = so_addr.CalculateSymbolContextSymbol(); @@ -600,8 +598,7 @@ addr_t InstrumentationRuntimeTSan::GetFirstNonInternalFramePc( addr_t addr = *maybe_addr; lldb_private::Address so_addr; - if (!process_sp->GetTarget().GetSectionLoadList().ResolveLoadAddress( - addr, so_addr)) + if (!process_sp->GetTarget().ResolveLoadAddress(addr, so_addr)) continue; if (so_addr.GetModule() == runtime_module_sp) diff --git a/lldb/source/Plugins/JITLoader/GDB/JITLoaderGDB.cpp b/lldb/source/Plugins/JITLoader/GDB/JITLoaderGDB.cpp index 1688fb27430a7..b6487d4e8ed4b 100644 --- a/lldb/source/Plugins/JITLoader/GDB/JITLoaderGDB.cpp +++ b/lldb/source/Plugins/JITLoader/GDB/JITLoaderGDB.cpp @@ -377,7 +377,7 @@ bool JITLoaderGDB::ReadJITDescriptorImpl(bool all_entries) { for (uint32_t i = 0; i < num_sections; ++i) { SectionSP section_sp(section_list->GetSectionAtIndex(i)); if (section_sp) { - target.GetSectionLoadList().SetSectionUnloaded(section_sp); + target.SetSectionUnloaded(section_sp); } } } diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp index e7ca3f655f237..fb706544ea560 100644 --- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp @@ -266,21 +266,20 @@ CPPLanguageRuntime::FindLibCppStdFunctionCallableInfo( Target &target = process->GetTarget(); - if (target.GetSectionLoadList().IsEmpty()) + if (!target.HasLoadedSections()) return optional_info; Address vtable_first_entry_resolved; - if (!target.GetSectionLoadList().ResolveLoadAddress( - vtable_address_first_entry, vtable_first_entry_resolved)) + if (!target.ResolveLoadAddress(vtable_address_first_entry, + vtable_first_entry_resolved)) return optional_info; Address vtable_addr_resolved; SymbolContext sc; Symbol *symbol = nullptr; - if (!target.GetSectionLoadList().ResolveLoadAddress(vtable_address, - vtable_addr_resolved)) + if (!target.ResolveLoadAddress(vtable_address, 
vtable_addr_resolved)) return optional_info; target.GetImages().ResolveSymbolContextForAddress( @@ -322,8 +321,8 @@ CPPLanguageRuntime::FindLibCppStdFunctionCallableInfo( // Setup for cases 2, 4 and 5 we have a pointer to a function after the // vtable. We will use a process of elimination to drop through each case // and obtain the data we need. - if (target.GetSectionLoadList().ResolveLoadAddress( - possible_function_address, function_address_resolved)) { + if (target.ResolveLoadAddress(possible_function_address, + function_address_resolved)) { target.GetImages().ResolveSymbolContextForAddress( function_address_resolved, eSymbolContextEverything, sc); symbol = sc.symbol; @@ -418,15 +417,14 @@ CPPLanguageRuntime::GetStepThroughTrampolinePlan(Thread &thread, TargetSP target_sp(thread.CalculateTarget()); - if (target_sp->GetSectionLoadList().IsEmpty()) + if (!target_sp->HasLoadedSections()) return ret_plan_sp; Address pc_addr_resolved; SymbolContext sc; Symbol *symbol; - if (!target_sp->GetSectionLoadList().ResolveLoadAddress(curr_pc, - pc_addr_resolved)) + if (!target_sp->ResolveLoadAddress(curr_pc, pc_addr_resolved)) return ret_plan_sp; target_sp->GetImages().ResolveSymbolContextForAddress( diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp index 6452baa4f84af..13e1198516f78 100644 --- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp +++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp @@ -762,8 +762,7 @@ bool ObjectFileELF::SetLoadAddress(Target &target, lldb::addr_t value, if (GetAddressByteSize() == 4) load_addr &= 0xFFFFFFFF; - if (target.GetSectionLoadList().SetSectionLoadAddress(section_sp, - load_addr)) + if (target.SetSectionLoadAddress(section_sp, load_addr)) ++num_loaded_sections; } } diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp index 488c9bd1e54af..bf2d293d2012c 100644 --- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp +++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp @@ -6253,9 +6253,9 @@ bool ObjectFileMachO::SetLoadAddress(Target &target, lldb::addr_t value, "0x%" PRIx64, section_sp->GetName().AsCString(), section_sp->GetFileAddress() + value); - if (target.GetSectionLoadList().SetSectionLoadAddress( - section_sp, section_sp->GetFileAddress() + value, - warn_multiple)) + if (target.SetSectionLoadAddress(section_sp, + section_sp->GetFileAddress() + value, + warn_multiple)) ++num_loaded_sections; } } @@ -6276,8 +6276,8 @@ bool ObjectFileMachO::SetLoadAddress(Target &target, lldb::addr_t value, "ObjectFileMachO::SetLoadAddress segment '%s' load addr is " "0x%" PRIx64, section_sp->GetName().AsCString(), section_load_addr); - if (target.GetSectionLoadList().SetSectionLoadAddress( - section_sp, section_load_addr, warn_multiple)) + if (target.SetSectionLoadAddress(section_sp, section_load_addr, + warn_multiple)) ++num_loaded_sections; } } diff --git a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp index bfdb8140e40af..6d92a204b86cc 100644 --- a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp +++ b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp @@ -482,7 +482,7 @@ bool ObjectFilePECOFF::SetLoadAddress(Target &target, addr_t value, // that have SHF_ALLOC in their flag bits. 
SectionSP section_sp(section_list->GetSectionAtIndex(sect_idx)); if (section_sp && !section_sp->IsThreadSpecific()) { - if (target.GetSectionLoadList().SetSectionLoadAddress( + if (target.SetSectionLoadAddress( section_sp, section_sp->GetFileAddress() + value)) ++num_loaded_sections; } diff --git a/lldb/source/Plugins/ObjectFile/Placeholder/ObjectFilePlaceholder.cpp b/lldb/source/Plugins/ObjectFile/Placeholder/ObjectFilePlaceholder.cpp index ec1f3f61892d3..e8745d6dd6b83 100644 --- a/lldb/source/Plugins/ObjectFile/Placeholder/ObjectFilePlaceholder.cpp +++ b/lldb/source/Plugins/ObjectFile/Placeholder/ObjectFilePlaceholder.cpp @@ -59,8 +59,7 @@ bool ObjectFilePlaceholder::SetLoadAddress(Target &target, addr_t value, GetModule()->GetSectionList(); assert(m_sections_up->GetNumSections(0) == 1); - target.GetSectionLoadList().SetSectionLoadAddress( - m_sections_up->GetSectionAtIndex(0), m_base); + target.SetSectionLoadAddress(m_sections_up->GetSectionAtIndex(0), m_base); return true; } diff --git a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp index 05b3bb9f54f9c..ef3c00e2857df 100644 --- a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp +++ b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp @@ -383,12 +383,12 @@ void ProcessMinidump::BuildMemoryRegions() { MemoryRegionInfos to_add; ModuleList &modules = GetTarget().GetImages(); - SectionLoadList &load_list = GetTarget().GetSectionLoadList(); + Target &target = GetTarget(); modules.ForEach([&](const ModuleSP &module_sp) { SectionList *sections = module_sp->GetSectionList(); for (size_t i = 0; i < sections->GetSize(); ++i) { SectionSP section_sp = sections->GetSectionAtIndex(i); - addr_t load_addr = load_list.GetSectionLoadAddress(section_sp); + addr_t load_addr = target.GetSectionLoadAddress(section_sp); if (load_addr == LLDB_INVALID_ADDRESS) continue; MemoryRegionInfo::RangeType section_range(load_addr, diff --git a/lldb/source/Plugins/Trace/intel-pt/TraceIntelPTBundleSaver.cpp b/lldb/source/Plugins/Trace/intel-pt/TraceIntelPTBundleSaver.cpp index a09bb372bb01c..3b1535a931999 100644 --- a/lldb/source/Plugins/Trace/intel-pt/TraceIntelPTBundleSaver.cpp +++ b/lldb/source/Plugins/Trace/intel-pt/TraceIntelPTBundleSaver.cpp @@ -263,8 +263,7 @@ BuildModulesSection(Process &process, FileSpec directory) { lldb::addr_t load_addr = LLDB_INVALID_ADDRESS; Address base_addr(objfile->GetBaseAddress()); - if (base_addr.IsValid() && - !process.GetTarget().GetSectionLoadList().IsEmpty()) + if (base_addr.IsValid() && process.GetTarget().HasLoadedSections()) load_addr = base_addr.GetLoadAddress(&process.GetTarget()); if (load_addr == LLDB_INVALID_ADDRESS) diff --git a/lldb/source/Symbol/ObjectFile.cpp b/lldb/source/Symbol/ObjectFile.cpp index d3881f8ccf7fe..264acad050e35 100644 --- a/lldb/source/Symbol/ObjectFile.cpp +++ b/lldb/source/Symbol/ObjectFile.cpp @@ -646,8 +646,7 @@ ObjectFile::GetLoadableData(Target &target) { for (size_t i = 0; i < section_count; ++i) { LoadableData loadable; SectionSP section_sp = section_list->GetSectionAtIndex(i); - loadable.Dest = - target.GetSectionLoadList().GetSectionLoadAddress(section_sp); + loadable.Dest = target.GetSectionLoadAddress(section_sp); if (loadable.Dest == LLDB_INVALID_ADDRESS) continue; // We can skip sections like bss diff --git a/lldb/source/Target/ProcessTrace.cpp b/lldb/source/Target/ProcessTrace.cpp index 4718a7ca50a7c..f131339905474 100644 --- a/lldb/source/Target/ProcessTrace.cpp +++ b/lldb/source/Target/ProcessTrace.cpp 
@@ -123,7 +123,7 @@ bool ProcessTrace::GetProcessInfo(ProcessInstanceInfo &info) {
 size_t ProcessTrace::DoReadMemory(addr_t addr, void *buf, size_t size,
                                   Status &error) {
   Address resolved_address;
-  GetTarget().GetSectionLoadList().ResolveLoadAddress(addr, resolved_address);
+  GetTarget().ResolveLoadAddress(addr, resolved_address);

   return GetTarget().ReadMemoryFromFileCache(resolved_address, buf, size,
                                              error);
diff --git a/lldb/source/Target/SectionLoadHistory.cpp b/lldb/source/Target/SectionLoadHistory.cpp
index f329d425e34b2..99797b1d1abc5 100644
--- a/lldb/source/Target/SectionLoadHistory.cpp
+++ b/lldb/source/Target/SectionLoadHistory.cpp
@@ -112,13 +112,15 @@ SectionLoadHistory::GetSectionLoadAddress(uint32_t stop_id,
 }

 bool SectionLoadHistory::ResolveLoadAddress(uint32_t stop_id, addr_t load_addr,
-                                            Address &so_addr) {
+                                            Address &so_addr,
+                                            bool allow_section_end) {
   // First find the top level section that this load address exists in
   std::lock_guard<std::recursive_mutex> guard(m_mutex);
   const bool read_only = true;
   SectionLoadList *section_load_list =
       GetSectionLoadListForStopID(stop_id, read_only);
-  return section_load_list->ResolveLoadAddress(load_addr, so_addr);
+  return section_load_list->ResolveLoadAddress(load_addr, so_addr,
+                                               allow_section_end);
 }

 bool SectionLoadHistory::SetSectionLoadAddress(
diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp
index 46216ba2d566d..8d77097477651 100644
--- a/lldb/source/Target/Target.cpp
+++ b/lldb/source/Target/Target.cpp
@@ -270,12 +270,18 @@ void Target::DeleteCurrentProcess() {
   if (m_process_sp) {
     // We dispose any active tracing sessions on the current process
     m_trace_sp.reset();
-    m_section_load_history.Clear();
+
     if (m_process_sp->IsAlive())
       m_process_sp->Destroy(false);

     m_process_sp->Finalize(false /* not destructing */);

+    // Let the process finalize itself first, then clear the section load
+    // history. Some objects owned by the process might end up calling
+    // SectionLoadHistory::SetSectionUnloaded() which can create entries in
+    // the section load history that can mess up subsequent processes.
+ m_section_load_history.Clear(); + CleanupProcess(); m_process_sp.reset(); @@ -3217,8 +3223,9 @@ Status Target::Install(ProcessLaunchInfo *launch_info) { } bool Target::ResolveLoadAddress(addr_t load_addr, Address &so_addr, - uint32_t stop_id) { - return m_section_load_history.ResolveLoadAddress(stop_id, load_addr, so_addr); + uint32_t stop_id, bool allow_section_end) { + return m_section_load_history.ResolveLoadAddress(stop_id, load_addr, so_addr, + allow_section_end); } bool Target::ResolveFileAddress(lldb::addr_t file_addr, @@ -5147,3 +5154,15 @@ Target::ReportStatistics(const lldb_private::StatisticsOptions &options) { } void Target::ResetStatistics() { m_stats.Reset(*this); } + +bool Target::HasLoadedSections() { return !GetSectionLoadList().IsEmpty(); } + +lldb::addr_t Target::GetSectionLoadAddress(const lldb::SectionSP §ion_sp) { + return GetSectionLoadList().GetSectionLoadAddress(section_sp); +} + +void Target::ClearSectionLoadList() { GetSectionLoadList().Clear(); } + +void Target::DumpSectionLoadList(Stream &s) { + GetSectionLoadList().Dump(s, this); +} diff --git a/lldb/source/Target/ThreadPlanStepInRange.cpp b/lldb/source/Target/ThreadPlanStepInRange.cpp index 4a2ede8b39728..109d1b6b3435b 100644 --- a/lldb/source/Target/ThreadPlanStepInRange.cpp +++ b/lldb/source/Target/ThreadPlanStepInRange.cpp @@ -263,8 +263,7 @@ bool ThreadPlanStepInRange::ShouldStop(Event *event_ptr) { const Architecture *arch = GetTarget().GetArchitecturePlugin(); if (arch) { Address curr_sec_addr; - GetTarget().GetSectionLoadList().ResolveLoadAddress(curr_addr, - curr_sec_addr); + GetTarget().ResolveLoadAddress(curr_addr, curr_sec_addr); bytes_to_skip = arch->GetBytesToSkip(*sc.symbol, curr_sec_addr); } } diff --git a/lldb/source/Target/ThreadPlanTracer.cpp b/lldb/source/Target/ThreadPlanTracer.cpp index ff9f49c6d4bb6..356ce379c2993 100644 --- a/lldb/source/Target/ThreadPlanTracer.cpp +++ b/lldb/source/Target/ThreadPlanTracer.cpp @@ -140,8 +140,7 @@ void ThreadPlanAssemblyTracer::Log() { Address pc_addr; bool addr_valid = false; uint8_t buffer[16] = {0}; // Must be big enough for any single instruction - addr_valid = m_process.GetTarget().GetSectionLoadList().ResolveLoadAddress( - pc, pc_addr); + addr_valid = m_process.GetTarget().ResolveLoadAddress(pc, pc_addr); pc_addr.Dump(stream, &GetThread(), Address::DumpStyleResolvedDescription, Address::DumpStyleModuleWithFileAddress); From 273a94b3d5a78cd9122c7b3bbb5d5a87147735d2 Mon Sep 17 00:00:00 2001 From: Alex MacLean Date: Tue, 14 Jan 2025 21:28:29 -0800 Subject: [PATCH 30/82] [NVPTX] Add some more immediate instruction variants (#122746) While this likely won't impact the final SASS, it makes for more compact PTX. 
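As a hypothetical illustration of the compaction (the register numbering and
exact mov encoding below are illustrative, not output taken from this patch),
an instruction whose left operand is an immediate previously needed that
immediate materialized into a register first:

    define i32 @udiv_imm_lhs(i32 %a) {
      ; Before (no "ir" variant): the immediate goes through a register:
      ;   mov.b32 %r2, 34;
      ;   div.u32 %r3, %r2, %r1;
      ; After (new div.u32 "ir" variant): a single instruction:
      ;   div.u32 %r2, 34, %r1;
      %ret = udiv i32 34, %a
      ret i32 %ret
    }

Commutative operations (add, mul, min, max) need no "ir" form because their
operands can simply be swapped to reuse the existing "ri" variant, hence the
new commutative flag on I3 below.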
---
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 127 ++++++++++---------
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 13 --
 llvm/test/CodeGen/NVPTX/arithmetic-int.ll | 23 ++++
 llvm/test/CodeGen/NVPTX/fma.ll | 14 ++
 llvm/test/CodeGen/NVPTX/i128.ll | 148 ++++++++++------------
 llvm/test/CodeGen/NVPTX/shift-parts.ll | 1 -
 6 files changed, 175 insertions(+), 151 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 6a95d9ebef6c7..f8dc66d598025 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -207,33 +207,39 @@ class ValueToRegClass<ValueType T> {
 // Some Common Instruction Class Templates
 //===----------------------------------------------------------------------===//

+// Utility class to wrap up information about a register and DAG type for more
+// convenient iteration and parameterization
+class RegTyInfo<ValueType ty, NVPTXRegClass rc, Operand imm> {
+  ValueType Ty = ty;
+  NVPTXRegClass RC = rc;
+  Operand Imm = imm;
+  int Size = ty.Size;
+}
+
+def I16RT : RegTyInfo<i16, Int16Regs, i16imm>;
+def I32RT : RegTyInfo<i32, Int32Regs, i32imm>;
+def I64RT : RegTyInfo<i64, Int64Regs, i64imm>;
+
 // Template for instructions which take three int64, int32, or int16 args.
 // The instructions are named "<OpcStr><Width>" (e.g. "add.s64").
-multiclass I3<string OpcStr, SDNode OpNode> {
-  def i64rr :
-    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
-              !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
-              [(set i64:$dst, (OpNode i64:$a, i64:$b))]>;
-  def i64ri :
-    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
-              !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
-              [(set i64:$dst, (OpNode i64:$a, imm:$b))]>;
-  def i32rr :
-    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
-              !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
-              [(set i32:$dst, (OpNode i32:$a, i32:$b))]>;
-  def i32ri :
-    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
-              !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
-              [(set i32:$dst, (OpNode i32:$a, imm:$b))]>;
-  def i16rr :
-    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
-              !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
-              [(set i16:$dst, (OpNode i16:$a, i16:$b))]>;
-  def i16ri :
-    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
-              !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
-              [(set i16:$dst, (OpNode i16:$a, (imm):$b))]>;
+multiclass I3<string OpcStr, SDNode OpNode, bit commutative> {
+  foreach t = [I16RT, I32RT, I64RT] in {
+    defvar asmstr = OpcStr # t.Size # " \t$dst, $a, $b;";
+
+    def t.Ty # rr :
+      NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.RC:$b),
+                asmstr,
+                [(set t.Ty:$dst, (OpNode t.Ty:$a, t.Ty:$b))]>;
+    def t.Ty # ri :
+      NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.Imm:$b),
+                asmstr,
+                [(set t.Ty:$dst, (OpNode t.RC:$a, imm:$b))]>;
+    if !not(commutative) then
+      def t.Ty # ir :
+        NVPTXInst<(outs t.RC:$dst), (ins t.Imm:$a, t.RC:$b),
+                  asmstr,
+                  [(set t.Ty:$dst, (OpNode imm:$a, t.RC:$b))]>;
+  }
 }

 class I16x2<string OpcStr, SDNode OpNode> :
@@ -870,8 +876,8 @@ defm SUB_i1 : ADD_SUB_i1<sub>;

 // int16, int32, and int64 signed addition. Since nvptx is 2's complement, we
 // also use these for unsigned arithmetic.
-defm ADD : I3<"add.s", add>;
-defm SUB : I3<"sub.s", sub>;
+defm ADD : I3<"add.s", add, /*commutative=*/ true>;
+defm SUB : I3<"sub.s", sub, /*commutative=*/ false>;

 def ADD16x2 : I16x2<"add.s", add>;

@@ -883,18 +889,18 @@ defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc>;
 defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde>;
 defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube>;

-defm MULT : I3<"mul.lo.s", mul>;
+defm MULT : I3<"mul.lo.s", mul, /*commutative=*/ true>;

-defm MULTHS : I3<"mul.hi.s", mulhs>;
-defm MULTHU : I3<"mul.hi.u", mulhu>;
+defm MULTHS : I3<"mul.hi.s", mulhs, /*commutative=*/ true>;
+defm MULTHU : I3<"mul.hi.u", mulhu, /*commutative=*/ true>;

-defm SDIV : I3<"div.s", sdiv>;
-defm UDIV : I3<"div.u", udiv>;
+defm SDIV : I3<"div.s", sdiv, /*commutative=*/ false>;
+defm UDIV : I3<"div.u", udiv, /*commutative=*/ false>;

 // The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM
 // will lower it.
-defm SREM : I3<"rem.s", srem>;
-defm UREM : I3<"rem.u", urem>;
+defm SREM : I3<"rem.s", srem, /*commutative=*/ false>;
+defm UREM : I3<"rem.u", urem, /*commutative=*/ false>;

 // Integer absolute value. NumBits should be one minus the bit width of RC.
 // This idiom implements the algorithm at
@@ -909,10 +915,10 @@ defm ABS_32 : ABS<i32, Int32Regs, ".s32">;
 defm ABS_64 : ABS<i64, Int64Regs, ".s64">;

 // Integer min/max.
-defm SMAX : I3<"max.s", smax>;
-defm UMAX : I3<"max.u", umax>;
-defm SMIN : I3<"min.s", smin>;
-defm UMIN : I3<"min.u", umin>;
+defm SMAX : I3<"max.s", smax, /*commutative=*/ true>;
+defm UMAX : I3<"max.u", umax, /*commutative=*/ true>;
+defm SMIN : I3<"min.s", smin, /*commutative=*/ true>;
+defm UMIN : I3<"min.u", umin, /*commutative=*/ true>;

 def SMAX16x2 : I16x2<"max.s", smax>;
 def UMAX16x2 : I16x2<"max.u", umax>;
@@ -1392,25 +1398,32 @@ def FDIV32ri_prec :
 //
 multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
-  def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
-                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
-                      [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
-                      Requires<[Pred]>;
-  def rri : NVPTXInst<(outs RC:$dst),
-                      (ins RC:$a, RC:$b, ImmCls:$c),
-                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
-                      [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>,
-                      Requires<[Pred]>;
-  def rir : NVPTXInst<(outs RC:$dst),
-                      (ins RC:$a, ImmCls:$b, RC:$c),
-                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
-                      [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>,
-                      Requires<[Pred]>;
-  def rii : NVPTXInst<(outs RC:$dst),
-                      (ins RC:$a, ImmCls:$b, ImmCls:$c),
-                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
-                      [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
-                      Requires<[Pred]>;
+  defvar asmstr = OpcStr # " \t$dst, $a, $b, $c;";
+  def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
+                      asmstr,
+                      [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
+                      Requires<[Pred]>;
+  def rri : NVPTXInst<(outs RC:$dst),
+                      (ins RC:$a, RC:$b, ImmCls:$c),
+                      asmstr,
+                      [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>,
+                      Requires<[Pred]>;
+  def rir : NVPTXInst<(outs RC:$dst),
+                      (ins RC:$a, ImmCls:$b, RC:$c),
+                      asmstr,
+                      [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>,
+                      Requires<[Pred]>;
+  def rii : NVPTXInst<(outs RC:$dst),
+                      (ins RC:$a, ImmCls:$b, ImmCls:$c),
+                      asmstr,
+                      [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
+                      Requires<[Pred]>;
+  def iir : NVPTXInst<(outs RC:$dst),
+                      (ins ImmCls:$a, ImmCls:$b, RC:$c),
+                      asmstr,
+                      [(set RC:$dst, (fma fpimm:$a, fpimm:$b, RC:$c))]>,
+                      Requires<[Pred]>;
+
 }

 multiclass FMA_F16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> {
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 4f144cc641080..2d6ee2e28b4df 100644
---
a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -6,19 +6,6 @@
 //
 //===----------------------------------------------------------------------===//

-// Utility class to wrap up information about a register and DAG type for more
-// convenient iteration and parameterization
-class RegTyInfo<ValueType ty, NVPTXRegClass rc, Operand imm> {
-  ValueType Ty = ty;
-  NVPTXRegClass RC = rc;
-  Operand Imm = imm;
-  int Size = ty.Size;
-}
-
-def I32RT : RegTyInfo<i32, Int32Regs, i32imm>;
-def I64RT : RegTyInfo<i64, Int64Regs, i64imm>;
-
-
 def immFloat0 : PatLeaf<(fpimm), [{
   float f = (float)N->getValueAPF().convertToFloat();
   return (f==0.0f);
diff --git a/llvm/test/CodeGen/NVPTX/arithmetic-int.ll b/llvm/test/CodeGen/NVPTX/arithmetic-int.ll
index dc710a5c288a7..1fbfd0a987d7a 100644
--- a/llvm/test/CodeGen/NVPTX/arithmetic-int.ll
+++ b/llvm/test/CodeGen/NVPTX/arithmetic-int.ll
@@ -317,3 +317,26 @@ define i16 @lshr_i16(i16 %a, i16 %b) {
   %ret = lshr i16 %a, %b
   ret i16 %ret
 }
+
+;; Immediate cases
+
+define i16 @srem_i16_ir(i16 %a) {
+; CHECK: rem.s16 %rs{{[0-9]+}}, 12, %rs{{[0-9]+}}
+; CHECK: ret
+  %ret = srem i16 12, %a
+  ret i16 %ret
+}
+
+define i32 @udiv_i32_ir(i32 %a) {
+; CHECK: div.u32 %r{{[0-9]+}}, 34, %r{{[0-9]+}}
+; CHECK: ret
+  %ret = udiv i32 34, %a
+  ret i32 %ret
+}
+
+define i64 @sub_i64_ir(i64 %a) {
+; CHECK: sub.s64 %rd{{[0-9]+}}, 56, %rd{{[0-9]+}}
+; CHECK: ret
+  %ret = sub i64 56, %a
+  ret i64 %ret
+}
diff --git a/llvm/test/CodeGen/NVPTX/fma.ll b/llvm/test/CodeGen/NVPTX/fma.ll
index 69ee6167a4d3e..3416420367beb 100644
--- a/llvm/test/CodeGen/NVPTX/fma.ll
+++ b/llvm/test/CodeGen/NVPTX/fma.ll
@@ -41,3 +41,17 @@ define ptx_device double @t2_f64(double %x, double %y, double %z, double %w) {
   %d = call double @dummy_f64(double %b, double %c)
   ret double %d
 }
+
+define ptx_device float @f32_iir(float %x) {
+; CHECK: fma.rn.f32 %f{{[0-9]+}}, 0f52E8D4A5, 0f4A52FC54, %f{{[0-9]+}};
+; CHECK: ret;
+  %r = call float @llvm.fma.f32(float 499999997952.0, float 3456789.0, float %x)
+  ret float %r
+}
+
+define ptx_device float @f32_iii(float %x) {
+; CHECK: mov.f32 %f{{[0-9]+}}, 0f41200000;
+; CHECK: ret;
+  %r = call float @llvm.fma.f32(float 2.0, float 3.0, float 4.0)
+  ret float %r
+}
diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll
index accfbe4af0313..7ece0ccbd844e 100644
--- a/llvm/test/CodeGen/NVPTX/i128.ll
+++ b/llvm/test/CodeGen/NVPTX/i128.ll
@@ -6,7 +6,7 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-LABEL: srem_i128(
 ; CHECK: {
 ; CHECK-NEXT: .reg .pred %p<19>;
-; CHECK-NEXT: .reg .b32 %r<20>;
+; CHECK-NEXT: .reg .b32 %r<16>;
 ; CHECK-NEXT: .reg .b64 %rd<129>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases
@@ -67,32 +67,29 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-NEXT: or.b64 %rd72, %rd121, %rd122;
 ; CHECK-NEXT: setp.eq.s64 %p15, %rd72, 0;
 ; CHECK-NEXT: cvt.u32.u64 %r9, %rd66;
-; CHECK-NEXT: mov.b32 %r10, 127;
-; CHECK-NEXT: sub.s32 %r11, %r10, %r9;
-; CHECK-NEXT: shl.b64 %rd73, %rd4, %r11;
-; CHECK-NEXT: mov.b32 %r12, 64;
-; CHECK-NEXT: sub.s32 %r13, %r12, %r11;
-; CHECK-NEXT: shr.u64 %rd74, %rd3, %r13;
+; CHECK-NEXT: sub.s32 %r10, 127, %r9;
+; CHECK-NEXT: shl.b64 %rd73, %rd4, %r10;
+; CHECK-NEXT: sub.s32 %r11, 64, %r10;
+; CHECK-NEXT: shr.u64 %rd74, %rd3, %r11;
 ; CHECK-NEXT: or.b64 %rd75, %rd73, %rd74;
-; CHECK-NEXT: mov.b32 %r14, 63;
-; CHECK-NEXT: sub.s32 %r15, %r14, %r9;
-; CHECK-NEXT: shl.b64 %rd76, %rd3, %r15;
-; CHECK-NEXT: setp.gt.s32 %p16, %r11, 63;
+; CHECK-NEXT: sub.s32 %r12, 63, %r9;
+; CHECK-NEXT: shl.b64 %rd76, %rd3, %r12;
+; CHECK-NEXT: setp.gt.s32 %p16, %r10,
63; ; CHECK-NEXT: selp.b64 %rd126, %rd76, %rd75, %p16; -; CHECK-NEXT: shl.b64 %rd125, %rd3, %r11; +; CHECK-NEXT: shl.b64 %rd125, %rd3, %r10; ; CHECK-NEXT: mov.u64 %rd116, %rd119; ; CHECK-NEXT: @%p15 bra $L__BB0_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader -; CHECK-NEXT: cvt.u32.u64 %r16, %rd121; -; CHECK-NEXT: shr.u64 %rd79, %rd3, %r16; -; CHECK-NEXT: sub.s32 %r18, %r12, %r16; -; CHECK-NEXT: shl.b64 %rd80, %rd4, %r18; +; CHECK-NEXT: cvt.u32.u64 %r13, %rd121; +; CHECK-NEXT: shr.u64 %rd79, %rd3, %r13; +; CHECK-NEXT: sub.s32 %r14, 64, %r13; +; CHECK-NEXT: shl.b64 %rd80, %rd4, %r14; ; CHECK-NEXT: or.b64 %rd81, %rd79, %rd80; -; CHECK-NEXT: add.s32 %r19, %r16, -64; -; CHECK-NEXT: shr.u64 %rd82, %rd4, %r19; -; CHECK-NEXT: setp.gt.s32 %p17, %r16, 63; +; CHECK-NEXT: add.s32 %r15, %r13, -64; +; CHECK-NEXT: shr.u64 %rd82, %rd4, %r15; +; CHECK-NEXT: setp.gt.s32 %p17, %r13, 63; ; CHECK-NEXT: selp.b64 %rd123, %rd82, %rd81, %p17; -; CHECK-NEXT: shr.u64 %rd124, %rd4, %r16; +; CHECK-NEXT: shr.u64 %rd124, %rd4, %r13; ; CHECK-NEXT: add.cc.s64 %rd35, %rd5, -1; ; CHECK-NEXT: addc.cc.s64 %rd36, %rd6, -1; ; CHECK-NEXT: mov.b64 %rd116, 0; @@ -155,7 +152,7 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) { ; CHECK-LABEL: urem_i128( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<17>; -; CHECK-NEXT: .reg .b32 %r<20>; +; CHECK-NEXT: .reg .b32 %r<16>; ; CHECK-NEXT: .reg .b64 %rd<115>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases @@ -205,32 +202,29 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: or.b64 %rd62, %rd107, %rd108; ; CHECK-NEXT: setp.eq.s64 %p13, %rd62, 0; ; CHECK-NEXT: cvt.u32.u64 %r9, %rd56; -; CHECK-NEXT: mov.b32 %r10, 127; -; CHECK-NEXT: sub.s32 %r11, %r10, %r9; -; CHECK-NEXT: shl.b64 %rd63, %rd42, %r11; -; CHECK-NEXT: mov.b32 %r12, 64; -; CHECK-NEXT: sub.s32 %r13, %r12, %r11; -; CHECK-NEXT: shr.u64 %rd64, %rd41, %r13; +; CHECK-NEXT: sub.s32 %r10, 127, %r9; +; CHECK-NEXT: shl.b64 %rd63, %rd42, %r10; +; CHECK-NEXT: sub.s32 %r11, 64, %r10; +; CHECK-NEXT: shr.u64 %rd64, %rd41, %r11; ; CHECK-NEXT: or.b64 %rd65, %rd63, %rd64; -; CHECK-NEXT: mov.b32 %r14, 63; -; CHECK-NEXT: sub.s32 %r15, %r14, %r9; -; CHECK-NEXT: shl.b64 %rd66, %rd41, %r15; -; CHECK-NEXT: setp.gt.s32 %p14, %r11, 63; +; CHECK-NEXT: sub.s32 %r12, 63, %r9; +; CHECK-NEXT: shl.b64 %rd66, %rd41, %r12; +; CHECK-NEXT: setp.gt.s32 %p14, %r10, 63; ; CHECK-NEXT: selp.b64 %rd112, %rd66, %rd65, %p14; -; CHECK-NEXT: shl.b64 %rd111, %rd41, %r11; +; CHECK-NEXT: shl.b64 %rd111, %rd41, %r10; ; CHECK-NEXT: mov.u64 %rd102, %rd105; ; CHECK-NEXT: @%p13 bra $L__BB1_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader -; CHECK-NEXT: cvt.u32.u64 %r16, %rd107; -; CHECK-NEXT: shr.u64 %rd69, %rd41, %r16; -; CHECK-NEXT: sub.s32 %r18, %r12, %r16; -; CHECK-NEXT: shl.b64 %rd70, %rd42, %r18; +; CHECK-NEXT: cvt.u32.u64 %r13, %rd107; +; CHECK-NEXT: shr.u64 %rd69, %rd41, %r13; +; CHECK-NEXT: sub.s32 %r14, 64, %r13; +; CHECK-NEXT: shl.b64 %rd70, %rd42, %r14; ; CHECK-NEXT: or.b64 %rd71, %rd69, %rd70; -; CHECK-NEXT: add.s32 %r19, %r16, -64; -; CHECK-NEXT: shr.u64 %rd72, %rd42, %r19; -; CHECK-NEXT: setp.gt.s32 %p15, %r16, 63; +; CHECK-NEXT: add.s32 %r15, %r13, -64; +; CHECK-NEXT: shr.u64 %rd72, %rd42, %r15; +; CHECK-NEXT: setp.gt.s32 %p15, %r13, 63; ; CHECK-NEXT: selp.b64 %rd109, %rd72, %rd71, %p15; -; CHECK-NEXT: shr.u64 %rd110, %rd42, %r16; +; CHECK-NEXT: shr.u64 %rd110, %rd42, %r13; ; CHECK-NEXT: add.cc.s64 %rd33, %rd3, -1; ; CHECK-NEXT: addc.cc.s64 %rd34, %rd4, -1; ; CHECK-NEXT: mov.b64 %rd102, 0; @@ -324,7 +318,7 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) 
{ ; CHECK-LABEL: sdiv_i128( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<19>; -; CHECK-NEXT: .reg .b32 %r<20>; +; CHECK-NEXT: .reg .b32 %r<16>; ; CHECK-NEXT: .reg .b64 %rd<122>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases @@ -386,32 +380,29 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: or.b64 %rd73, %rd114, %rd115; ; CHECK-NEXT: setp.eq.s64 %p15, %rd73, 0; ; CHECK-NEXT: cvt.u32.u64 %r9, %rd67; -; CHECK-NEXT: mov.b32 %r10, 127; -; CHECK-NEXT: sub.s32 %r11, %r10, %r9; -; CHECK-NEXT: shl.b64 %rd74, %rd2, %r11; -; CHECK-NEXT: mov.b32 %r12, 64; -; CHECK-NEXT: sub.s32 %r13, %r12, %r11; -; CHECK-NEXT: shr.u64 %rd75, %rd1, %r13; +; CHECK-NEXT: sub.s32 %r10, 127, %r9; +; CHECK-NEXT: shl.b64 %rd74, %rd2, %r10; +; CHECK-NEXT: sub.s32 %r11, 64, %r10; +; CHECK-NEXT: shr.u64 %rd75, %rd1, %r11; ; CHECK-NEXT: or.b64 %rd76, %rd74, %rd75; -; CHECK-NEXT: mov.b32 %r14, 63; -; CHECK-NEXT: sub.s32 %r15, %r14, %r9; -; CHECK-NEXT: shl.b64 %rd77, %rd1, %r15; -; CHECK-NEXT: setp.gt.s32 %p16, %r11, 63; +; CHECK-NEXT: sub.s32 %r12, 63, %r9; +; CHECK-NEXT: shl.b64 %rd77, %rd1, %r12; +; CHECK-NEXT: setp.gt.s32 %p16, %r10, 63; ; CHECK-NEXT: selp.b64 %rd119, %rd77, %rd76, %p16; -; CHECK-NEXT: shl.b64 %rd118, %rd1, %r11; +; CHECK-NEXT: shl.b64 %rd118, %rd1, %r10; ; CHECK-NEXT: mov.u64 %rd109, %rd112; ; CHECK-NEXT: @%p15 bra $L__BB4_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader -; CHECK-NEXT: cvt.u32.u64 %r16, %rd114; -; CHECK-NEXT: shr.u64 %rd80, %rd1, %r16; -; CHECK-NEXT: sub.s32 %r18, %r12, %r16; -; CHECK-NEXT: shl.b64 %rd81, %rd2, %r18; +; CHECK-NEXT: cvt.u32.u64 %r13, %rd114; +; CHECK-NEXT: shr.u64 %rd80, %rd1, %r13; +; CHECK-NEXT: sub.s32 %r14, 64, %r13; +; CHECK-NEXT: shl.b64 %rd81, %rd2, %r14; ; CHECK-NEXT: or.b64 %rd82, %rd80, %rd81; -; CHECK-NEXT: add.s32 %r19, %r16, -64; -; CHECK-NEXT: shr.u64 %rd83, %rd2, %r19; -; CHECK-NEXT: setp.gt.s32 %p17, %r16, 63; +; CHECK-NEXT: add.s32 %r15, %r13, -64; +; CHECK-NEXT: shr.u64 %rd83, %rd2, %r15; +; CHECK-NEXT: setp.gt.s32 %p17, %r13, 63; ; CHECK-NEXT: selp.b64 %rd116, %rd83, %rd82, %p17; -; CHECK-NEXT: shr.u64 %rd117, %rd2, %r16; +; CHECK-NEXT: shr.u64 %rd117, %rd2, %r13; ; CHECK-NEXT: add.cc.s64 %rd35, %rd3, -1; ; CHECK-NEXT: addc.cc.s64 %rd36, %rd4, -1; ; CHECK-NEXT: mov.b64 %rd109, 0; @@ -466,7 +457,7 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) { ; CHECK-LABEL: udiv_i128( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<17>; -; CHECK-NEXT: .reg .b32 %r<20>; +; CHECK-NEXT: .reg .b32 %r<16>; ; CHECK-NEXT: .reg .b64 %rd<107>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases @@ -516,32 +507,29 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: or.b64 %rd62, %rd99, %rd100; ; CHECK-NEXT: setp.eq.s64 %p13, %rd62, 0; ; CHECK-NEXT: cvt.u32.u64 %r9, %rd56; -; CHECK-NEXT: mov.b32 %r10, 127; -; CHECK-NEXT: sub.s32 %r11, %r10, %r9; -; CHECK-NEXT: shl.b64 %rd63, %rd42, %r11; -; CHECK-NEXT: mov.b32 %r12, 64; -; CHECK-NEXT: sub.s32 %r13, %r12, %r11; -; CHECK-NEXT: shr.u64 %rd64, %rd41, %r13; +; CHECK-NEXT: sub.s32 %r10, 127, %r9; +; CHECK-NEXT: shl.b64 %rd63, %rd42, %r10; +; CHECK-NEXT: sub.s32 %r11, 64, %r10; +; CHECK-NEXT: shr.u64 %rd64, %rd41, %r11; ; CHECK-NEXT: or.b64 %rd65, %rd63, %rd64; -; CHECK-NEXT: mov.b32 %r14, 63; -; CHECK-NEXT: sub.s32 %r15, %r14, %r9; -; CHECK-NEXT: shl.b64 %rd66, %rd41, %r15; -; CHECK-NEXT: setp.gt.s32 %p14, %r11, 63; +; CHECK-NEXT: sub.s32 %r12, 63, %r9; +; CHECK-NEXT: shl.b64 %rd66, %rd41, %r12; +; CHECK-NEXT: setp.gt.s32 %p14, %r10, 63; ; CHECK-NEXT: selp.b64 %rd104, %rd66, %rd65, %p14; -; 
CHECK-NEXT: shl.b64 %rd103, %rd41, %r11;
+; CHECK-NEXT: shl.b64 %rd103, %rd41, %r10;
 ; CHECK-NEXT: mov.u64 %rd94, %rd97;
 ; CHECK-NEXT: @%p13 bra $L__BB5_4;
 ; CHECK-NEXT: // %bb.1: // %udiv-preheader
-; CHECK-NEXT: cvt.u32.u64 %r16, %rd99;
-; CHECK-NEXT: shr.u64 %rd69, %rd41, %r16;
-; CHECK-NEXT: sub.s32 %r18, %r12, %r16;
-; CHECK-NEXT: shl.b64 %rd70, %rd42, %r18;
+; CHECK-NEXT: cvt.u32.u64 %r13, %rd99;
+; CHECK-NEXT: shr.u64 %rd69, %rd41, %r13;
+; CHECK-NEXT: sub.s32 %r14, 64, %r13;
+; CHECK-NEXT: shl.b64 %rd70, %rd42, %r14;
 ; CHECK-NEXT: or.b64 %rd71, %rd69, %rd70;
-; CHECK-NEXT: add.s32 %r19, %r16, -64;
-; CHECK-NEXT: shr.u64 %rd72, %rd42, %r19;
-; CHECK-NEXT: setp.gt.s32 %p15, %r16, 63;
+; CHECK-NEXT: add.s32 %r15, %r13, -64;
+; CHECK-NEXT: shr.u64 %rd72, %rd42, %r15;
+; CHECK-NEXT: setp.gt.s32 %p15, %r13, 63;
 ; CHECK-NEXT: selp.b64 %rd101, %rd72, %rd71, %p15;
-; CHECK-NEXT: shr.u64 %rd102, %rd42, %r16;
+; CHECK-NEXT: shr.u64 %rd102, %rd42, %r13;
 ; CHECK-NEXT: add.cc.s64 %rd33, %rd43, -1;
 ; CHECK-NEXT: addc.cc.s64 %rd34, %rd44, -1;
 ; CHECK-NEXT: mov.b64 %rd94, 0;
diff --git a/llvm/test/CodeGen/NVPTX/shift-parts.ll b/llvm/test/CodeGen/NVPTX/shift-parts.ll
index c7cfdc4ff2a4d..ded1046714fd5 100644
--- a/llvm/test/CodeGen/NVPTX/shift-parts.ll
+++ b/llvm/test/CodeGen/NVPTX/shift-parts.ll
@@ -4,7 +4,6 @@
 ; CHECK: shift_parts_left_128
 define void @shift_parts_left_128(ptr %val, ptr %amtptr) {
 ; CHECK: shl.b64
-; CHECK: mov.b32
 ; CHECK: sub.s32
 ; CHECK: shr.u64
 ; CHECK: or.b64

From 02403f4e450b86d93197dd34045ff40a34b21494 Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Wed, 15 Jan 2025 13:42:21 +0800
Subject: [PATCH 31/82] [RISCV] Split strided-load-store.ll tests into EVL and
 VP. NFC

None of the changes in #122232 or the upcoming #122244 are specific to the
EVL, so split out the EVL tail-folded loops into separate "integration tests"
that reflect the output of the loop vectorizer.

---
 .../CodeGen/RISCV/rvv/strided-load-store.ll | 150 +++++++++++++-----
 1 file changed, 111 insertions(+), 39 deletions(-)

diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll
index 7c1fab9bfe91a..f777c450bc106 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll
@@ -400,33 +400,21 @@ declare void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64>, <vscale x 1 x ptr>, i32, <vscale x 1 x i1>)
 declare <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr>, i32, <vscale x 1 x i1>, <vscale x 1 x i64>)

-; TODO: Make the step loop variant to reflect what the loop vectorizer will emit
-; in an EVL tail folding configuration.
- define @vp_gather(ptr %a, i32 %len) { ; CHECK-LABEL: @vp_gather( ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64 ; CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.stepvector.nxv1i64() -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP0]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR1:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACCUM:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ELEMS:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[VEC_IND_SCALAR]] -; CHECK-NEXT: [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[ELEMS]], i32 1, i1 true) -; CHECK-NEXT: [[ODD:%.*]] = and [[VEC_IND]], splat (i64 1) -; CHECK-NEXT: [[MASK:%.*]] = icmp ne [[ODD]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR1]], i32 3 -; CHECK-NEXT: [[GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv1i64.p0.i64(ptr [[TMP2]], i64 16, [[MASK]], i32 [[EVL]]) +; CHECK-NEXT: [[GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv1i64.p0.i64(ptr [[TMP2]], i64 16, splat (i1 true), i32 42) ; CHECK-NEXT: [[ACCUM_NEXT]] = add [[ACCUM]], [[GATHER]] ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[TMP0]] ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR1]] = add i64 [[VEC_IND_SCALAR1]], [[TMP0]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] ; CHECK: for.cond.cleanup: @@ -444,15 +432,8 @@ vector.body: ; preds = %vector.body, %vecto %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.ind = phi [ %1, %vector.ph ], [ %vec.ind.next, %vector.body ] %accum = phi [ zeroinitializer, %vector.ph ], [ %accum.next, %vector.body ] - - %elems = sub i64 %wide.trip.count, %index - %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %elems, i32 1, i1 true) - - %odd = and %vec.ind, splat (i64 1) - %mask = icmp ne %odd, splat (i64 0) - %2 = getelementptr inbounds %struct.foo, ptr %a, %vec.ind, i32 3 - %gather = call @llvm.vp.gather( %2, %mask, i32 %evl) + %gather = call @llvm.vp.gather( %2, splat (i1 true), i32 42) %accum.next = add %accum, %gather %index.next = add nuw i64 %index, %0 %vec.ind.next = add %vec.ind, %.splat @@ -463,31 +444,19 @@ for.cond.cleanup: ; preds = %vector.body ret %accum.next } -; TODO: Make the step loop variant to reflect what the loop vectorizer will emit -; in an EVL tail folding configuration. 
- define void @vp_scatter(ptr %a, i32 %len) { ; CHECK-LABEL: @vp_scatter( ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64 ; CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.stepvector.nxv1i64() -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP0]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR1:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ELEMS:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[VEC_IND_SCALAR]] -; CHECK-NEXT: [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[ELEMS]], i32 1, i1 true) -; CHECK-NEXT: [[ODD:%.*]] = and [[VEC_IND]], splat (i64 1) -; CHECK-NEXT: [[MASK:%.*]] = icmp ne [[ODD]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR1]], i32 3 -; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.nxv1i64.p0.i64( zeroinitializer, ptr [[TMP2]], i64 16, [[MASK]], i32 [[EVL]]) +; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.nxv1i64.p0.i64( zeroinitializer, ptr [[TMP2]], i64 16, splat (i1 true), i32 42) ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[TMP0]] ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR1]] = add i64 [[VEC_IND_SCALAR1]], [[TMP0]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] ; CHECK: for.cond.cleanup: @@ -504,17 +473,120 @@ vector.ph: vector.body: ; preds = %vector.body, %vector.ph %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.ind = phi [ %1, %vector.ph ], [ %vec.ind.next, %vector.body ] + %2 = getelementptr inbounds %struct.foo, ptr %a, %vec.ind, i32 3 + tail call void @llvm.vp.scatter( zeroinitializer, %2, splat (i1 true), i32 42) + %index.next = add nuw i64 %index, %0 + %vec.ind.next = add %vec.ind, %.splat + %3 = icmp ne i64 %index.next, %wide.trip.count + br i1 %3, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +; Test that reflects what the loop vectorizer will generate for an EVL tail +; folded loop + +define @evl_gather(ptr %a, i32 %len) { +; CHECK-LABEL: @evl_gather( +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.stepvector.nxv1i64() +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ELEMS:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[INDEX]] +; CHECK-NEXT: [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[ELEMS]], i32 1, i1 true) +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr 
inbounds [[STRUCT_FOO:%.*]], ptr [[A:%.*]], [[VEC_IND]], i32 3 +; CHECK-NEXT: [[GATHER:%.*]] = call @llvm.vp.gather.nxv1i64.nxv1p0( [[TMP2]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[ACCUM_NEXT]] = add [[ACCUM]], [[GATHER]] +; CHECK-NEXT: [[EVL_ZEXT:%.*]] = zext i32 [[EVL]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[EVL_ZEXT]] +; CHECK-NEXT: [[EVL_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_ZEXT]], i64 0 +; CHECK-NEXT: [[EVL_SPLAT:%.*]] = shufflevector [[EVL_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[EVL_SPLAT]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[INDEX_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret [[ACCUM_NEXT]] +; +vector.ph: + %wide.trip.count = zext i32 %len to i64 + %1 = tail call @llvm.stepvector.nxv1i64() + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.ind = phi [ %1, %vector.ph ], [ %vec.ind.next, %vector.body ] + %accum = phi [ zeroinitializer, %vector.ph ], [ %accum.next, %vector.body ] %elems = sub i64 %wide.trip.count, %index %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %elems, i32 1, i1 true) - %odd = and %vec.ind, splat (i64 1) - %mask = icmp ne %odd, splat (i64 0) + %2 = getelementptr inbounds %struct.foo, ptr %a, %vec.ind, i32 3 + %gather = call @llvm.vp.gather( %2, splat (i1 true), i32 %evl) + %accum.next = add %accum, %gather + + %evl.zext = zext i32 %evl to i64 + %index.next = add nuw i64 %index, %evl.zext + %evl.splatinsert = insertelement poison, i64 %evl.zext, i64 0 + %evl.splat = shufflevector %evl.splatinsert, poison, zeroinitializer + %vec.ind.next = add %vec.ind, %evl.splat + %3 = icmp ne i64 %index.next, %wide.trip.count + br i1 %3, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret %accum.next +} + +; Test that reflects what the loop vectorizer will generate for an EVL tail +; folded loop + +define void @evl_scatter(ptr %a, i32 %len) { +; CHECK-LABEL: @evl_scatter( +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.stepvector.nxv1i64() +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP0]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ELEMS:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[ELEMS]], i32 1, i1 true) +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr [[A:%.*]], [[VEC_IND]], i32 3 +; CHECK-NEXT: tail call void @llvm.vp.scatter.nxv1i64.nxv1p0( zeroinitializer, [[TMP1]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[EVL_ZEXT:%.*]] = zext i32 [[EVL]] to i64 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[EVL_ZEXT]] +; CHECK-NEXT: [[EVL_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_ZEXT]], i64 0 +; CHECK-NEXT: [[EVL_SPLAT:%.*]] = shufflevector [[EVL_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[EVL_SPLAT]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]] +; 
CHECK-NEXT: br i1 [[TMP3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +vector.ph: + %wide.trip.count = zext i32 %len to i64 + %1 = tail call @llvm.stepvector.nxv1i64() + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.ind = phi [ %1, %vector.ph ], [ %vec.ind.next, %vector.body ] + + %elems = sub i64 %wide.trip.count, %index + %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %elems, i32 1, i1 true) %2 = getelementptr inbounds %struct.foo, ptr %a, %vec.ind, i32 3 - tail call void @llvm.vp.scatter( zeroinitializer, %2, %mask, i32 %evl) - %index.next = add nuw i64 %index, %0 - %vec.ind.next = add %vec.ind, %.splat + tail call void @llvm.vp.scatter( zeroinitializer, %2, splat (i1 true), i32 %evl) + + %evl.zext = zext i32 %evl to i64 + %index.next = add nuw i64 %index, %evl.zext + %evl.splatinsert = insertelement poison, i64 %evl.zext, i64 0 + %evl.splat = shufflevector %evl.splatinsert, poison, zeroinitializer + %vec.ind.next = add %vec.ind, %evl.splat %3 = icmp ne i64 %index.next, %wide.trip.count br i1 %3, label %for.cond.cleanup, label %vector.body From 030d48b7db9845f42bf3ef365193bfdbb23f5440 Mon Sep 17 00:00:00 2001 From: Bevin Hansson Date: Wed, 15 Jan 2025 07:14:20 +0100 Subject: [PATCH 32/82] [clangd] Augment code completion results with documentation from the index. (#120099) When looking up code completions from Sema, there is no associated documentation. This is due to crash issues with stale preambles. However, this also means that code completion results from other than the main file do not have documentation in certain cases, which is a bad user experience. This patch performs a lookup into the index using the code completion result declarations to find documentation, and attaches it to the results. Fixes clangd/clangd#2252 Fixes clangd/clangd#564 --- clang-tools-extra/clangd/CodeComplete.cpp | 27 +++++++ .../clangd/unittests/CodeCompleteTests.cpp | 81 +++++++++++++++++++ 2 files changed, 108 insertions(+) diff --git a/clang-tools-extra/clangd/CodeComplete.cpp b/clang-tools-extra/clangd/CodeComplete.cpp index fb39b7b292242..a8182ce98ebe0 100644 --- a/clang-tools-extra/clangd/CodeComplete.cpp +++ b/clang-tools-extra/clangd/CodeComplete.cpp @@ -1867,14 +1867,41 @@ class CodeCompleteFlow { CodeCompleteResult Output; // Convert the results to final form, assembling the expensive strings. + // If necessary, search the index for documentation comments. + LookupRequest Req; + llvm::DenseMap SymbolToCompletion; for (auto &C : Scored) { Output.Completions.push_back(toCodeCompletion(C.first)); Output.Completions.back().Score = C.second; Output.Completions.back().CompletionTokenRange = ReplacedRange; + if (Opts.Index && !Output.Completions.back().Documentation) { + for (auto &Cand : C.first) { + if (Cand.SemaResult && + Cand.SemaResult->Kind == CodeCompletionResult::RK_Declaration) { + auto ID = clangd::getSymbolID(Cand.SemaResult->getDeclaration()); + if (!ID) + continue; + Req.IDs.insert(ID); + SymbolToCompletion[ID] = Output.Completions.size() - 1; + } + } + } } Output.HasMore = Incomplete; Output.Context = CCContextKind; Output.CompletionRange = ReplacedRange; + + // Look up documentation from the index. 
+ if (Opts.Index) { + Opts.Index->lookup(Req, [&](const Symbol &S) { + if (S.Documentation.empty()) + return; + auto &C = Output.Completions[SymbolToCompletion.at(S.ID)]; + C.Documentation.emplace(); + parseDocumentation(S.Documentation, *C.Documentation); + }); + } + return Output; } diff --git a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp index 9d48a6e09fc77..b12f8275b8a26 100644 --- a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp +++ b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp @@ -1136,6 +1136,87 @@ int x = foo^ Contains(AllOf(named("foo"), doc("This comment should be retained!")))); } +TEST(CompletionTest, CommentsOnMembersFromHeader) { + MockFS FS; + MockCompilationDatabase CDB; + + auto Opts = ClangdServer::optsForTest(); + Opts.BuildDynamicSymbolIndex = true; + + ClangdServer Server(CDB, FS, Opts); + + FS.Files[testPath("foo.h")] = R"cpp( + struct alpha { + /// This is a member field. + int gamma; + + /// This is a member function. + int delta(); + }; + )cpp"; + + auto File = testPath("foo.cpp"); + Annotations Test(R"cpp( +#include "foo.h" +alpha a; +int x = a.^ + )cpp"); + runAddDocument(Server, File, Test.code()); + auto CompletionList = + llvm::cantFail(runCodeComplete(Server, File, Test.point(), {})); + + EXPECT_THAT(CompletionList.Completions, + Contains(AllOf(named("gamma"), doc("This is a member field.")))); + EXPECT_THAT( + CompletionList.Completions, + Contains(AllOf(named("delta"), doc("This is a member function.")))); +} + +TEST(CompletionTest, CommentsOnMembersFromHeaderOverloadBundling) { + using testing::AnyOf; + MockFS FS; + MockCompilationDatabase CDB; + + auto Opts = ClangdServer::optsForTest(); + Opts.BuildDynamicSymbolIndex = true; + + ClangdServer Server(CDB, FS, Opts); + + FS.Files[testPath("foo.h")] = R"cpp( + struct alpha { + /// bool overload. + int delta(bool b); + + /// int overload. + int delta(int i); + + void epsilon(long l); + + /// This one has a comment. + void epsilon(int i); + }; + )cpp"; + + auto File = testPath("foo.cpp"); + Annotations Test(R"cpp( +#include "foo.h" +alpha a; +int x = a.^ + )cpp"); + runAddDocument(Server, File, Test.code()); + clangd::CodeCompleteOptions CCOpts; + CCOpts.BundleOverloads = true; + auto CompletionList = + llvm::cantFail(runCodeComplete(Server, File, Test.point(), CCOpts)); + + EXPECT_THAT( + CompletionList.Completions, + Contains(AllOf(named("epsilon"), doc("This one has a comment.")))); + EXPECT_THAT(CompletionList.Completions, + Contains(AllOf(named("delta"), AnyOf(doc("bool overload."), + doc("int overload."))))); +} + TEST(CompletionTest, GlobalCompletionFiltering) { Symbol Class = cls("XYZ"); From 2504693d75c6ed1047955dd6e65ce9d4c1a164c8 Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Wed, 15 Jan 2025 14:25:36 +0800 Subject: [PATCH 33/82] [ARM][NFC] Remove redundant sub-expressions (#122911) This PR removes redundant sub-expressions `Mnemonic != "vqmovnt"`, which is mentioned in https://pvs-studio.com/en/blog/posts/cpp/1188/. 
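Reduced to a minimal form, the redundancy looks like this (an illustrative
sketch, not the actual parser code):

```cpp
#include "llvm/ADT/StringRef.h"

// Before: `M != "vqmovnt"` appears twice in the &&-chain, so the second
// occurrence can never affect the result and is dead code.
static bool isPlainMnemonicBefore(llvm::StringRef M) {
  return M != "vqmovnt" && M != "vqmovunt" && M != "vqmovnt" && M != "vmovnt";
}

// After: each mnemonic is compared exactly once.
static bool isPlainMnemonicAfter(llvm::StringRef M) {
  return M != "vqmovnt" && M != "vqmovunt" && M != "vmovnt";
}
```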
--- llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 024a64aceedbd..dad91c6a969e8 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -6663,9 +6663,9 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic, StringRef ExtraToken, Mnemonic != "vshllt" && Mnemonic != "vrshrnt" && Mnemonic != "vshrnt" && Mnemonic != "vqrshrunt" && Mnemonic != "vqshrunt" && Mnemonic != "vqrshrnt" && Mnemonic != "vqshrnt" && Mnemonic != "vmullt" && - Mnemonic != "vqmovnt" && Mnemonic != "vqmovunt" && - Mnemonic != "vqmovnt" && Mnemonic != "vmovnt" && Mnemonic != "vqdmullt" && - Mnemonic != "vpnot" && Mnemonic != "vcvtt" && Mnemonic != "vcvt") { + Mnemonic != "vqmovnt" && Mnemonic != "vqmovunt" && Mnemonic != "vmovnt" && + Mnemonic != "vqdmullt" && Mnemonic != "vpnot" && Mnemonic != "vcvtt" && + Mnemonic != "vcvt") { unsigned VCC = ARMVectorCondCodeFromString(Mnemonic.substr(Mnemonic.size() - 1)); if (VCC != ~0U) { From 135f39c780ac106ac93ef0838a62d7ebf947c2a0 Mon Sep 17 00:00:00 2001 From: Michael Liao Date: Wed, 15 Jan 2025 01:26:14 -0500 Subject: [PATCH 34/82] [SPIR-V] Fix a typo in cmake. NFC - It should be '${SPIRV_AS}' that is linked or copied to 'spirv-as', instead of '${SPIRV_VAL}' --- llvm/tools/spirv-tools/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/tools/spirv-tools/CMakeLists.txt b/llvm/tools/spirv-tools/CMakeLists.txt index 57dfe3310c459..a47c1f894d7a4 100644 --- a/llvm/tools/spirv-tools/CMakeLists.txt +++ b/llvm/tools/spirv-tools/CMakeLists.txt @@ -66,7 +66,7 @@ endif () if (SPIRV_AS) add_custom_target(spirv-as - COMMAND ${CMAKE_COMMAND} -E ${LLVM_LINK_OR_COPY} "${SPIRV_VAL}" "${LLVM_RUNTIME_OUTPUT_INTDIR}/spirv-as") + COMMAND ${CMAKE_COMMAND} -E ${LLVM_LINK_OR_COPY} "${SPIRV_AS}" "${LLVM_RUNTIME_OUTPUT_INTDIR}/spirv-as") else () add_custom_target(spirv-as COMMAND ${CMAKE_COMMAND} -E ${LLVM_LINK_OR_COPY} "${BINARY_DIR}/tools/spirv-as${CMAKE_EXECUTABLE_SUFFIX}" "${LLVM_RUNTIME_OUTPUT_INTDIR}/spirv-as${CMAKE_EXECUTABLE_SUFFIX}" From edc02351dd11cc4a39b7c541b26b71c6f36c8e55 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Wed, 15 Jan 2025 06:29:06 +0000 Subject: [PATCH 35/82] [NFC][LoopVectorize] Add more loop early exit asserts (#122732) This patch is split off #120567, adding asserts in addScalarResumePhis and addExitUsersForFirstOrderRecurrences that the loop does not contain an uncountable early exit, since the code cannot yet handle them correctly. 
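For readers unfamiliar with the term: an uncountable early exit is an exit
whose trip count cannot be computed before the loop runs. A minimal
hand-written illustration (not part of this patch):

```cpp
// The latch exit (i reaching n) is countable: its exit count is known on
// entry to the loop. The early return is uncountable: whether and when it
// fires depends on the loaded data. The new asserts guarantee the resume-phi
// and first-order-recurrence code is not reached for loops like this yet.
int findKey(const int *p, int n, int key) {
  for (int i = 0; i < n; ++i)
    if (p[i] == key) // uncountable early exit
      return i;
  return -1;
}
```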
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index fe2fb5e9faaea..99f6a8860f0f4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9082,8 +9082,9 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
   VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
   auto *ScalarPH = Plan.getScalarPreheader();
   auto *MiddleVPBB = cast(ScalarPH->getSinglePredecessor());
+  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
   VPBuilder VectorPHBuilder(
-      cast(Plan.getVectorLoopRegion()->getSinglePredecessor()));
+      cast(VectorRegion->getSinglePredecessor()));
   VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
   VPBuilder ScalarPHBuilder(ScalarPH);
   VPValue *OneVPV = Plan.getOrAddLiveIn(
@@ -9115,6 +9116,8 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
     // start value provides the value if the loop is bypassed.
     bool IsFOR = isa(VectorPhiR);
     auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
+    assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
+           "Cannot handle loops with uncountable early exits");
     if (IsFOR)
       ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
           VPInstruction::ExtractFromEnd, {ResumeFromVectorLoop, OneVPV}, {},
@@ -9284,6 +9287,9 @@ static void addExitUsersForFirstOrderRecurrences(
     if (!FOR)
       continue;

+    assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
+           "Cannot handle loops with uncountable early exits");
+
     // This is the second phase of vectorizing first-order recurrences, creating
     // extract for users outside the loop. An overview of the transformation is
     // described below. Suppose we have the following loop with some use after

From 7201cae106260aeb3e9bbbb7d5291ff30f05076a Mon Sep 17 00:00:00 2001
From: Chuanqi Xu
Date: Wed, 15 Jan 2025 15:15:35 +0800
Subject: [PATCH 36/82] [C++20] [Modules] Support module level lookup (#122887)

Close https://github.com/llvm/llvm-project/issues/90154

This patch is also an optimization of the lookup process that makes use of the
information provided by the `export` keyword.

Previously, the `export` keyword only took part in the validity check of a
lookup result; it was not involved in the lookup process itself. That is, in a
name lookup for 'name', we would load all declarations named 'name' and then
check whether each of them is valid. That works well, but it is inefficient
since it may load declarations that are not wanted.

Note that this patch does a trick in the lookup process instead of bringing
module information into DeclarationName or considering module information when
deciding whether two declarations are the same. So it would not surprise me if
there are missing cases. But those would not be regressions; that should
already be the case. Issue reports are welcome.

In this patch, I split the big lookup table into a lookup table as before and
a module-local lookup table, which is keyed on the combination of the ID of
the DeclContext and a hash of the primary module name, and refactored the
`DeclContext::lookup()` method to take the module information, so that a
lookup in a DeclContext won't load declarations that are local to **other**
modules.
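To make the effect concrete, here is a small hand-written sketch (not taken
from the patch; all names are made up) of the two kinds of names the split
tables separate:

```cpp
// m.cppm -- interface of module 'm'
export module m;

export int exported();  // External linkage: stays in the general lookup
                        // table, since any importer may name it.

int moduleLocal();      // Module linkage: visible only inside module 'm'.
                        // With this patch it goes into the module-local
                        // table, keyed by the DeclContext ID plus a hash of
                        // the primary module name.

// user.cpp -- a different translation unit
import m;
int use() { return exported(); }  // Lookup of 'exported' consults only the
                                  // general table; 'moduleLocal' is never
                                  // deserialized here, because user.cpp is
                                  // not part of module 'm'.
```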
Also, I think splitting the big lookup table is beneficial on its own, since
it may reduce conflicts during lookups in the hash table.

BTW, this patch introduces a **regression** for a reachability rule in C++20,
but it was a false negative. See
'clang/test/CXX/module/module.interface/p7.cpp' for details. This patch is not
expected to introduce any other regressions for users not using C++20 modules,
since the module-local lookup table should be empty for them.

---

On the API side, this patch unfortunately adds a possibly confusing argument
`Module *NamedModule` to `ExternalASTSource::FindExternalVisibleDeclsByName()`.
People may think we can get that information from the first argument, `const
DeclContext *DC`. But sadly there are declarations (e.g., namespaces) that can
appear in multiple different modules as a single declaration, so we have to
add additional information to indicate the module.
---
 clang/docs/ReleaseNotes.rst | 2 +
 clang/include/clang/AST/DeclBase.h | 10 +
 clang/include/clang/AST/ExternalASTMerger.h | 3 +-
 clang/include/clang/AST/ExternalASTSource.h | 17 +-
 .../clang/Sema/MultiplexExternalSemaSource.h | 3 +-
 .../include/clang/Serialization/ASTBitCodes.h | 6 +
 clang/include/clang/Serialization/ASTReader.h | 32 +-
 clang/include/clang/Serialization/ASTWriter.h | 16 +-
 clang/lib/AST/DeclBase.cpp | 23 +-
 clang/lib/AST/ExternalASTMerger.cpp | 3 +-
 clang/lib/AST/ExternalASTSource.cpp | 6 +-
 clang/lib/Interpreter/CodeCompletion.cpp | 6 +-
 .../lib/Sema/MultiplexExternalSemaSource.cpp | 7 +-
 clang/lib/Serialization/ASTReader.cpp | 195 ++++++++++---
 clang/lib/Serialization/ASTReaderDecl.cpp | 69 +++--
 clang/lib/Serialization/ASTReaderInternals.h | 72 ++++-
 clang/lib/Serialization/ASTWriter.cpp | 273 ++++++++++++++----
 clang/lib/Serialization/ASTWriterDecl.cpp | 13 +-
 .../basic.scope/basic.scope.namespace/p2.cpp | 4 +-
 .../test/CXX/module/basic/basic.link/p2.cppm | 3 +-
 clang/test/CXX/module/module.import/p2.cpp | 10 +-
 clang/test/CXX/module/module.interface/p7.cpp | 10 +-
 clang/test/CXX/module/module.reach/p5.cpp | 3 +-
 .../Reachability-template-default-arg.cpp | 3 +-
 clang/test/Modules/cxx20-10-1-ex2.cpp | 3 +-
 clang/test/Modules/deduction-guide3.cppm | 4 +-
 .../Modules/module-local-with-templates.cppm | 79 +++++
 clang/test/Modules/pr90154.cppm | 25 ++
 clang/unittests/AST/ExternalASTSourceTest.cpp | 3 +-
 .../Plugins/ExpressionParser/Clang/ASTUtils.h | 10 +-
 .../ExpressionParser/Clang/ClangASTSource.cpp | 3 +-
 .../ExpressionParser/Clang/ClangASTSource.h | 8 +-
 .../Clang/ClangExternalASTSourceCallbacks.cpp | 3 +-
 .../Clang/ClangExternalASTSourceCallbacks.h | 3 +-
 .../AppleObjCRuntime/AppleObjCDeclVendor.cpp | 3 +-
 35 files changed, 736 insertions(+), 197 deletions(-)
 create mode 100644 clang/test/Modules/module-local-with-templates.cppm
 create mode 100644 clang/test/Modules/pr90154.cppm

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 6ac91f43e66d8..c6bc95594f613 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -316,6 +316,8 @@ C++23 Feature Support
 C++20 Feature Support
 ^^^^^^^^^^^^^^^^^^^^^

+- Implemented module level lookup for C++20 modules. (#GH90154)
+
 Resolutions to C++ Defect Reports
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h
index 71ab9178509b2..91177c9a4b51f 100644
--- a/clang/include/clang/AST/DeclBase.h
+++ b/clang/include/clang/AST/DeclBase.h
@@ -836,6 +836,10 @@ class alignas(8) Decl {
     return isFromASTFile() ?
getImportedOwningModule() : getLocalOwningModule(); } + /// Get the top level owning named module that owns this declaration if any. + /// \returns nullptr if the declaration is not owned by a named module. + Module *getTopLevelOwningNamedModule() const; + /// Get the module that owns this declaration for linkage purposes. /// There only ever is such a standard C++ module. Module *getOwningModuleForLinkage() const; @@ -2722,6 +2726,12 @@ class DeclContext { bool Deserialize = false) const; private: + /// Lookup all external visible declarations and the external declarations + /// within the same module specified by \c NamedModule. We can't + /// get it from \c this since the same declaration may be declared in + /// multiple modules. e.g., namespace. + lookup_result lookupImpl(DeclarationName Name, Module *NamedModule) const; + /// Whether this declaration context has had externally visible /// storage added since the last lookup. In this case, \c LookupPtr's /// invariant may not hold and needs to be fixed before we perform diff --git a/clang/include/clang/AST/ExternalASTMerger.h b/clang/include/clang/AST/ExternalASTMerger.h index ec4cfbe2175c0..46f187c5e0694 100644 --- a/clang/include/clang/AST/ExternalASTMerger.h +++ b/clang/include/clang/AST/ExternalASTMerger.h @@ -141,7 +141,8 @@ class ExternalASTMerger : public ExternalASTSource { /// Implementation of the ExternalASTSource API. bool FindExternalVisibleDeclsByName(const DeclContext *DC, - DeclarationName Name) override; + DeclarationName Name, + Module *NamedModule) override; /// Implementation of the ExternalASTSource API. void diff --git a/clang/include/clang/AST/ExternalASTSource.h b/clang/include/clang/AST/ExternalASTSource.h index 4d7ff822fceb7..ee4ad634977dc 100644 --- a/clang/include/clang/AST/ExternalASTSource.h +++ b/clang/include/clang/AST/ExternalASTSource.h @@ -51,6 +51,7 @@ class RecordDecl; class Selector; class Stmt; class TagDecl; +class Module; /// Abstract interface for external sources of AST nodes. /// @@ -145,12 +146,20 @@ class ExternalASTSource : public RefCountedBase { /// Find all declarations with the given name in the given context, /// and add them to the context by calling SetExternalVisibleDeclsForName /// or SetNoExternalVisibleDeclsForName. - /// \return \c true if any declarations might have been found, \c false if - /// we definitely have no declarations with tbis name. + /// \param DC the context for lookup. + /// \param Name the name of the declarations to find. + /// \param NamedModule find declarations visible to the given module + /// \c NamedModule . This may be different from owning module of \c DC since + /// there are declarations (e.g., namespace declaration) can appear in + /// multiple modules. + /// + /// \return \c true if any declarations might have been found, and \c false + /// if we definitely have no declarations with this name. /// /// The default implementation of this method is a no-op returning \c false. - virtual bool - FindExternalVisibleDeclsByName(const DeclContext *DC, DeclarationName Name); + virtual bool FindExternalVisibleDeclsByName(const DeclContext *DC, + DeclarationName Name, + Module *NamedModule); /// Load all the external specializations for the Decl \param D if \param /// OnlyPartial is false. 
Otherwise, load all the external **partial** diff --git a/clang/include/clang/Sema/MultiplexExternalSemaSource.h b/clang/include/clang/Sema/MultiplexExternalSemaSource.h index 0c92c52854c9e..08d6143f7caaf 100644 --- a/clang/include/clang/Sema/MultiplexExternalSemaSource.h +++ b/clang/include/clang/Sema/MultiplexExternalSemaSource.h @@ -95,7 +95,8 @@ class MultiplexExternalSemaSource : public ExternalSemaSource { /// Find all declarations with the given name in the /// given context. bool FindExternalVisibleDeclsByName(const DeclContext *DC, - DeclarationName Name) override; + DeclarationName Name, + Module *NamedModule) override; bool LoadExternalSpecializations(const Decl *D, bool OnlyPartial) override; diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h index aac165130b719..40dae25f7b54b 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -738,6 +738,8 @@ enum ASTRecordTypes { CXX_ADDED_TEMPLATE_SPECIALIZATION = 74, CXX_ADDED_TEMPLATE_PARTIAL_SPECIALIZATION = 75, + + UPDATE_MODULE_LOCAL_VISIBLE = 76, }; /// Record types used within a source manager block. @@ -1334,6 +1336,10 @@ enum DeclCode { /// into a DeclContext via DeclContext::lookup. DECL_CONTEXT_VISIBLE, + /// A record containing the set of declarations that are + /// only visible from DeclContext in the same module. + DECL_CONTEXT_MODULE_LOCAL_VISIBLE, + /// A LabelDecl record. DECL_LABEL, diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index 9f978762a6fb6..ea12adaec3ee8 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -353,6 +353,7 @@ class ASTIdentifierLookupTrait; /// The on-disk hash table(s) used for DeclContext name lookup. struct DeclContextLookupTable; +struct ModuleLocalLookupTable; /// The on-disk hash table(s) used for specialization decls. struct LazySpecializationInfoLookupTable; @@ -523,9 +524,14 @@ class ASTReader /// in the chain. DeclUpdateOffsetsMap DeclUpdateOffsets; + struct LookupBlockOffsets { + uint64_t LexicalOffset; + uint64_t VisibleOffset; + uint64_t ModuleLocalOffset; + }; + using DelayedNamespaceOffsetMapTy = - llvm::DenseMap>; + llvm::DenseMap; /// Mapping from global declaration IDs to the lexical and visible block /// offset for delayed namespace in reduced BMI. @@ -631,6 +637,9 @@ class ASTReader /// Map from a DeclContext to its lookup tables. llvm::DenseMap Lookups; + llvm::DenseMap + ModuleLocalLookups; using SpecLookupTableTy = llvm::DenseMap PendingVisibleUpdates; + llvm::DenseMap + PendingModuleLocalVisibleUpdates; using SpecializationsUpdate = SmallVector; using SpecializationsUpdateMap = @@ -696,7 +707,8 @@ class ASTReader /// Read the record that describes the visible contents of a DC. bool ReadVisibleDeclContextStorage(ModuleFile &M, llvm::BitstreamCursor &Cursor, - uint64_t Offset, GlobalDeclID ID); + uint64_t Offset, GlobalDeclID ID, + bool IsModuleLocal); bool ReadSpecializations(ModuleFile &M, llvm::BitstreamCursor &Cursor, uint64_t Offset, Decl *D, bool IsPartial); @@ -1132,6 +1144,10 @@ class ASTReader /// Number of visible decl contexts read/total. unsigned NumVisibleDeclContextsRead = 0, TotalVisibleDeclContexts = 0; + /// Number of module local visible decl contexts read/total. 
+ unsigned NumModuleLocalVisibleDeclContexts = 0, + TotalModuleLocalVisibleDeclContexts = 0; + /// Total size of modules, in bits, currently loaded uint64_t TotalModulesSizeInBits = 0; @@ -1444,6 +1460,9 @@ class ASTReader const serialization::reader::DeclContextLookupTable * getLoadedLookupTables(DeclContext *Primary) const; + const serialization::reader::ModuleLocalLookupTable * + getModuleLocalLookupTables(DeclContext *Primary) const; + /// Get the loaded specializations lookup tables for \p D, /// if any. serialization::reader::LazySpecializationInfoLookupTable * @@ -2119,7 +2138,8 @@ class ASTReader /// The current implementation of this method just loads the entire /// lookup table as unmaterialized references. bool FindExternalVisibleDeclsByName(const DeclContext *DC, - DeclarationName Name) override; + DeclarationName Name, + Module *NamedModule) override; /// Read all of the declarations lexically stored in a /// declaration context. @@ -2607,6 +2627,10 @@ inline bool shouldSkipCheckingODR(const Decl *D) { (D->isFromGlobalModule() || D->isFromHeaderUnit()); } +/// Calculate a hash value for the primary module name of the given module. +/// \returns std::nullopt if M is not a C++ standard module. +std::optional getPrimaryModuleHash(const Module *M); + } // namespace clang #endif // LLVM_CLANG_SERIALIZATION_ASTREADER_H diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h index adb7cce522a80..53b09cc914392 100644 --- a/clang/include/clang/Serialization/ASTWriter.h +++ b/clang/include/clang/Serialization/ASTWriter.h @@ -492,6 +492,10 @@ class ASTWriter : public ASTDeserializationListener, /// file. unsigned NumVisibleDeclContexts = 0; + /// The number of module local visible declcontexts written to the AST + /// file. + unsigned NumModuleLocalDeclContexts = 0; + /// A mapping from each known submodule to its ID number, which will /// be a positive integer. 
llvm::DenseMap SubmoduleIDs; @@ -587,11 +591,15 @@ class ASTWriter : public ASTDeserializationListener, uint64_t WriteSpecializationInfoLookupTable( const NamedDecl *D, llvm::SmallVectorImpl &Specializations, bool IsPartial); - void GenerateNameLookupTable(ASTContext &Context, const DeclContext *DC, - llvm::SmallVectorImpl &LookupTable); + void + GenerateNameLookupTable(ASTContext &Context, const DeclContext *DC, + llvm::SmallVectorImpl &LookupTable, + llvm::SmallVectorImpl &ModuleLocalLookupTable); uint64_t WriteDeclContextLexicalBlock(ASTContext &Context, const DeclContext *DC); - uint64_t WriteDeclContextVisibleBlock(ASTContext &Context, DeclContext *DC); + void WriteDeclContextVisibleBlock(ASTContext &Context, DeclContext *DC, + uint64_t &VisibleBlockOffset, + uint64_t &ModuleLocalBlockOffset); void WriteTypeDeclOffsets(); void WriteFileDeclIDsMap(); void WriteComments(ASTContext &Context); @@ -624,7 +632,9 @@ class ASTWriter : public ASTDeserializationListener, unsigned DeclParmVarAbbrev = 0; unsigned DeclContextLexicalAbbrev = 0; unsigned DeclContextVisibleLookupAbbrev = 0; + unsigned DeclModuleLocalVisibleLookupAbbrev = 0; unsigned UpdateVisibleAbbrev = 0; + unsigned ModuleLocalUpdateVisibleAbbrev = 0; unsigned DeclRecordAbbrev = 0; unsigned DeclTypedefAbbrev = 0; unsigned DeclVarAbbrev = 0; diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp index fb701f76231bc..42daaa4f3dcc3 100644 --- a/clang/lib/AST/DeclBase.cpp +++ b/clang/lib/AST/DeclBase.cpp @@ -1850,15 +1850,28 @@ void DeclContext::buildLookupImpl(DeclContext *DCtx, bool Internal) { } } +Module *Decl::getTopLevelOwningNamedModule() const { + if (getOwningModule() && + getOwningModule()->getTopLevelModule()->isNamedModule()) + return getOwningModule()->getTopLevelModule(); + + return nullptr; +} + DeclContext::lookup_result DeclContext::lookup(DeclarationName Name) const { + return lookupImpl(Name, cast(this)->getTopLevelOwningNamedModule()); +} + +DeclContext::lookup_result DeclContext::lookupImpl(DeclarationName Name, + Module *NamedModule) const { // For transparent DeclContext, we should lookup in their enclosing context. if (getDeclKind() == Decl::LinkageSpec || getDeclKind() == Decl::Export) - return getParent()->lookup(Name); + return getParent()->lookupImpl(Name, NamedModule); const DeclContext *PrimaryContext = getPrimaryContext(); if (PrimaryContext != this) - return PrimaryContext->lookup(Name); + return PrimaryContext->lookupImpl(Name, NamedModule); // If we have an external source, ensure that any later redeclarations of this // context have been loaded, since they may add names to the result of this @@ -1889,7 +1902,8 @@ DeclContext::lookup(DeclarationName Name) const { if (!R.second && !R.first->second.hasExternalDecls()) return R.first->second.getLookupResult(); - if (Source->FindExternalVisibleDeclsByName(this, Name) || !R.second) { + if (Source->FindExternalVisibleDeclsByName(this, Name, NamedModule) || + !R.second) { if (StoredDeclsMap *Map = LookupPtr) { StoredDeclsMap::iterator I = Map->find(Name); if (I != Map->end()) @@ -2115,7 +2129,8 @@ void DeclContext::makeDeclVisibleInContextImpl(NamedDecl *D, bool Internal) { if (ExternalASTSource *Source = getParentASTContext().getExternalSource()) if (hasExternalVisibleStorage() && Map->find(D->getDeclName()) == Map->end()) - Source->FindExternalVisibleDeclsByName(this, D->getDeclName()); + Source->FindExternalVisibleDeclsByName( + this, D->getDeclName(), D->getTopLevelOwningNamedModule()); // Insert this declaration into the map. 
StoredDeclsList &DeclNameEntries = (*Map)[D->getDeclName()]; diff --git a/clang/lib/AST/ExternalASTMerger.cpp b/clang/lib/AST/ExternalASTMerger.cpp index 7f7816e1b10ea..a33f6e3447679 100644 --- a/clang/lib/AST/ExternalASTMerger.cpp +++ b/clang/lib/AST/ExternalASTMerger.cpp @@ -472,7 +472,8 @@ static bool importSpecializationsIfNeeded(Decl *D, ASTImporter *Importer) { } bool ExternalASTMerger::FindExternalVisibleDeclsByName(const DeclContext *DC, - DeclarationName Name) { + DeclarationName Name, + Module *NamedModule) { llvm::SmallVector Decls; llvm::SmallVector Candidates; diff --git a/clang/lib/AST/ExternalASTSource.cpp b/clang/lib/AST/ExternalASTSource.cpp index 543846c0093af..4a29f4944f73c 100644 --- a/clang/lib/AST/ExternalASTSource.cpp +++ b/clang/lib/AST/ExternalASTSource.cpp @@ -90,9 +90,9 @@ ExternalASTSource::GetExternalCXXBaseSpecifiers(uint64_t Offset) { return nullptr; } -bool -ExternalASTSource::FindExternalVisibleDeclsByName(const DeclContext *DC, - DeclarationName Name) { +bool ExternalASTSource::FindExternalVisibleDeclsByName(const DeclContext *DC, + DeclarationName Name, + Module *NamedModule) { return false; } diff --git a/clang/lib/Interpreter/CodeCompletion.cpp b/clang/lib/Interpreter/CodeCompletion.cpp index bbc8830d76bc0..9092d4705ca58 100644 --- a/clang/lib/Interpreter/CodeCompletion.cpp +++ b/clang/lib/Interpreter/CodeCompletion.cpp @@ -228,7 +228,8 @@ class ExternalSource : public clang::ExternalASTSource { ExternalSource(ASTContext &ChildASTCtxt, FileManager &ChildFM, ASTContext &ParentASTCtxt, FileManager &ParentFM); bool FindExternalVisibleDeclsByName(const DeclContext *DC, - DeclarationName Name) override; + DeclarationName Name, + Module *NamedModule) override; void completeVisibleDeclsMap(const clang::DeclContext *childDeclContext) override; }; @@ -271,7 +272,8 @@ ExternalSource::ExternalSource(ASTContext &ChildASTCtxt, FileManager &ChildFM, } bool ExternalSource::FindExternalVisibleDeclsByName(const DeclContext *DC, - DeclarationName Name) { + DeclarationName Name, + Module *NamedModule) { IdentifierTable &ParentIdTable = ParentASTCtxt.Idents; diff --git a/clang/lib/Sema/MultiplexExternalSemaSource.cpp b/clang/lib/Sema/MultiplexExternalSemaSource.cpp index 54944267b4868..c19a0f980c1e9 100644 --- a/clang/lib/Sema/MultiplexExternalSemaSource.cpp +++ b/clang/lib/Sema/MultiplexExternalSemaSource.cpp @@ -107,11 +107,12 @@ MultiplexExternalSemaSource::hasExternalDefinitions(const Decl *D) { return EK_ReplyHazy; } -bool MultiplexExternalSemaSource:: -FindExternalVisibleDeclsByName(const DeclContext *DC, DeclarationName Name) { +bool MultiplexExternalSemaSource::FindExternalVisibleDeclsByName( + const DeclContext *DC, DeclarationName Name, Module *NamedModule) { bool AnyDeclsFound = false; for (size_t i = 0; i < Sources.size(); ++i) - AnyDeclsFound |= Sources[i]->FindExternalVisibleDeclsByName(DC, Name); + AnyDeclsFound |= + Sources[i]->FindExternalVisibleDeclsByName(DC, Name, NamedModule); return AnyDeclsFound; } diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 7361cace49dd7..06853a227215e 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -1235,7 +1235,7 @@ unsigned DeclarationNameKey::getHash() const { } ModuleFile * -ASTDeclContextNameLookupTrait::ReadFileRef(const unsigned char *&d) { +ASTDeclContextNameLookupTraitBase::ReadFileRef(const unsigned char *&d) { using namespace llvm::support; uint32_t ModuleFileID = @@ -1244,12 +1244,12 @@ 
ASTDeclContextNameLookupTrait::ReadFileRef(const unsigned char *&d) { } std::pair -ASTDeclContextNameLookupTrait::ReadKeyDataLength(const unsigned char *&d) { +ASTDeclContextNameLookupTraitBase::ReadKeyDataLength(const unsigned char *&d) { return readULEBKeyDataLength(d); } -ASTDeclContextNameLookupTrait::internal_key_type -ASTDeclContextNameLookupTrait::ReadKey(const unsigned char *d, unsigned) { +DeclarationNameKey +ASTDeclContextNameLookupTraitBase::ReadKeyBase(const unsigned char *&d) { using namespace llvm::support; auto Kind = (DeclarationName::NameKind)*d++; @@ -1283,10 +1283,13 @@ ASTDeclContextNameLookupTrait::ReadKey(const unsigned char *d, unsigned) { return DeclarationNameKey(Kind, Data); } -void ASTDeclContextNameLookupTrait::ReadDataInto(internal_key_type, - const unsigned char *d, - unsigned DataLen, - data_type_builder &Val) { +ASTDeclContextNameLookupTrait::internal_key_type +ASTDeclContextNameLookupTrait::ReadKey(const unsigned char *d, unsigned) { + return ReadKeyBase(d); +} + +void ASTDeclContextNameLookupTraitBase::ReadDataIntoImpl( + const unsigned char *d, unsigned DataLen, data_type_builder &Val) { using namespace llvm::support; for (unsigned NumDecls = DataLen / sizeof(DeclID); NumDecls; --NumDecls) { @@ -1296,6 +1299,47 @@ void ASTDeclContextNameLookupTrait::ReadDataInto(internal_key_type, } } +void ASTDeclContextNameLookupTrait::ReadDataInto(internal_key_type, + const unsigned char *d, + unsigned DataLen, + data_type_builder &Val) { + ReadDataIntoImpl(d, DataLen, Val); +} + +ModuleLocalNameLookupTrait::hash_value_type +ModuleLocalNameLookupTrait::ComputeHash(const internal_key_type &Key) { + llvm::FoldingSetNodeID ID; + ID.AddInteger(Key.first.getHash()); + ID.AddInteger(Key.second); + return ID.computeStableHash(); +} + +ModuleLocalNameLookupTrait::internal_key_type +ModuleLocalNameLookupTrait::GetInternalKey(const external_key_type &Key) { + DeclarationNameKey Name(Key.first); + + std::optional ModuleHash = getPrimaryModuleHash(Key.second); + if (!ModuleHash) + return {Name, 0}; + + return {Name, *ModuleHash}; +} + +ModuleLocalNameLookupTrait::internal_key_type +ModuleLocalNameLookupTrait::ReadKey(const unsigned char *d, unsigned) { + DeclarationNameKey Name = ReadKeyBase(d); + unsigned PrimaryModuleHash = + llvm::support::endian::readNext(d); + return {Name, PrimaryModuleHash}; +} + +void ModuleLocalNameLookupTrait::ReadDataInto(internal_key_type, + const unsigned char *d, + unsigned DataLen, + data_type_builder &Val) { + ReadDataIntoImpl(d, DataLen, Val); +} + ModuleFile * LazySpecializationInfoLookupTrait::ReadFileRef(const unsigned char *&d) { using namespace llvm::support; @@ -1383,8 +1427,8 @@ bool ASTReader::ReadLexicalDeclContextStorage(ModuleFile &M, bool ASTReader::ReadVisibleDeclContextStorage(ModuleFile &M, BitstreamCursor &Cursor, - uint64_t Offset, - GlobalDeclID ID) { + uint64_t Offset, GlobalDeclID ID, + bool IsModuleLocal) { assert(Offset != 0); SavedStreamPosition SavedPosition(Cursor); @@ -1408,15 +1452,22 @@ bool ASTReader::ReadVisibleDeclContextStorage(ModuleFile &M, return true; } unsigned RecCode = MaybeRecCode.get(); - if (RecCode != DECL_CONTEXT_VISIBLE) { + if (!IsModuleLocal && RecCode != DECL_CONTEXT_VISIBLE) { Error("Expected visible lookup table block"); return true; } + if (IsModuleLocal && RecCode != DECL_CONTEXT_MODULE_LOCAL_VISIBLE) { + Error("Expected module local visible lookup table block"); + return true; + } // We can't safely determine the primary context yet, so delay attaching the // lookup table until we're done with 
recursive deserialization. auto *Data = (const unsigned char*)Blob.data(); - PendingVisibleUpdates[ID].push_back(UpdateData{&M, Data}); + if (!IsModuleLocal) + PendingVisibleUpdates[ID].push_back(UpdateData{&M, Data}); + else + PendingModuleLocalVisibleUpdates[ID].push_back(UpdateData{&M, Data}); return false; } @@ -3549,6 +3600,19 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, break; } + case UPDATE_MODULE_LOCAL_VISIBLE: { + unsigned Idx = 0; + GlobalDeclID ID = ReadDeclID(F, Record, Idx); + auto *Data = (const unsigned char *)Blob.data(); + PendingModuleLocalVisibleUpdates[ID].push_back(UpdateData{&F, Data}); + // If we've already loaded the decl, perform the updates when we finish + // loading this block. + if (Decl *D = GetExistingDecl(ID)) + PendingUpdateRecords.push_back( + PendingUpdateRecord(ID, D, /*JustLoaded=*/false)); + break; + } + case CXX_ADDED_TEMPLATE_SPECIALIZATION: { unsigned Idx = 0; GlobalDeclID ID = ReadDeclID(F, Record, Idx); @@ -3652,6 +3716,7 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, TotalNumMacros += Record[1]; TotalLexicalDeclContexts += Record[2]; TotalVisibleDeclContexts += Record[3]; + TotalModuleLocalVisibleDeclContexts += Record[4]; break; case UNUSED_FILESCOPED_DECLS: @@ -3937,7 +4002,7 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, break; case DELAYED_NAMESPACE_LEXICAL_VISIBLE_RECORD: { - if (Record.size() % 3 != 0) + if (Record.size() % 4 != 0) return llvm::createStringError( std::errc::illegal_byte_sequence, "invalid DELAYED_NAMESPACE_LEXICAL_VISIBLE_RECORD block in AST " @@ -3953,8 +4018,12 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, uint64_t LocalVisibleOffset = Record[I++]; uint64_t VisibleOffset = LocalVisibleOffset ? BaseOffset + LocalVisibleOffset : 0; + uint64_t LocalModuleLocalOffset = Record[I++]; + uint64_t ModuleLocalOffset = + LocalModuleLocalOffset ? BaseOffset + LocalModuleLocalOffset : 0; - DelayedNamespaceOffsetMap[ID] = {LexicalOffset, VisibleOffset}; + DelayedNamespaceOffsetMap[ID] = {LexicalOffset, VisibleOffset, + ModuleLocalOffset}; assert(!GetExistingDecl(ID) && "We shouldn't load the namespace in the front of delayed " @@ -8366,31 +8435,42 @@ void ASTReader::FindFileRegionDecls(FileID File, *DInfo.Mod, LocalDeclID::get(*this, *DInfo.Mod, *DIt)))); } -bool -ASTReader::FindExternalVisibleDeclsByName(const DeclContext *DC, - DeclarationName Name) { +bool ASTReader::FindExternalVisibleDeclsByName(const DeclContext *DC, + DeclarationName Name, + Module *NamedModule) { assert(DC->hasExternalVisibleStorage() && DC == DC->getPrimaryContext() && "DeclContext has no visible decls in storage"); if (!Name) return false; - auto It = Lookups.find(DC); - if (It == Lookups.end()) - return false; - - Deserializing LookupResults(this); - // Load the list of declarations. SmallVector Decls; llvm::SmallPtrSet Found; - for (GlobalDeclID ID : It->second.Table.find(Name)) { - NamedDecl *ND = cast(GetDecl(ID)); - if (ND->getDeclName() == Name && Found.insert(ND).second) - Decls.push_back(ND); + Deserializing LookupResults(this); + + // FIXME: Clear the redundancy with templated lambda in C++20 when that's + // available. 
+ if (auto It = Lookups.find(DC); It != Lookups.end()) { + ++NumVisibleDeclContextsRead; + for (GlobalDeclID ID : It->second.Table.find(Name)) { + NamedDecl *ND = cast(GetDecl(ID)); + if (ND->getDeclName() == Name && Found.insert(ND).second) + Decls.push_back(ND); + } + } + + if (NamedModule) { + if (auto It = ModuleLocalLookups.find(DC); It != ModuleLocalLookups.end()) { + ++NumModuleLocalVisibleDeclContexts; + for (GlobalDeclID ID : It->second.Table.find({Name, NamedModule})) { + NamedDecl *ND = cast(GetDecl(ID)); + if (ND->getDeclName() == Name && Found.insert(ND).second) + Decls.push_back(ND); + } + } } - ++NumVisibleDeclContextsRead; SetExternalVisibleDeclsForName(DC, Name, Decls); return !Decls.empty(); } @@ -8399,18 +8479,25 @@ void ASTReader::completeVisibleDeclsMap(const DeclContext *DC) { if (!DC->hasExternalVisibleStorage()) return; - auto It = Lookups.find(DC); - assert(It != Lookups.end() && - "have external visible storage but no lookup tables"); - DeclsMap Decls; - for (GlobalDeclID ID : It->second.Table.findAll()) { - NamedDecl *ND = cast(GetDecl(ID)); - Decls[ND->getDeclName()].push_back(ND); - } + auto findAll = [&](auto &LookupTables, unsigned &NumRead) { + auto It = LookupTables.find(DC); + if (It == LookupTables.end()) + return; - ++NumVisibleDeclContextsRead; + NumRead++; + + for (GlobalDeclID ID : It->second.Table.findAll()) { + NamedDecl *ND = cast(GetDecl(ID)); + Decls[ND->getDeclName()].push_back(ND); + } + + // FIXME: Why a PCH test is failing if we remove the iterator after findAll? + }; + + findAll(Lookups, NumVisibleDeclContextsRead); + findAll(ModuleLocalLookups, NumModuleLocalVisibleDeclContexts); for (DeclsMap::iterator I = Decls.begin(), E = Decls.end(); I != E; ++I) { SetExternalVisibleDeclsForName(DC, I->first, I->second); @@ -8424,6 +8511,12 @@ ASTReader::getLoadedLookupTables(DeclContext *Primary) const { return I == Lookups.end() ? nullptr : &I->second; } +const serialization::reader::ModuleLocalLookupTable * +ASTReader::getModuleLocalLookupTables(DeclContext *Primary) const { + auto I = ModuleLocalLookups.find(Primary); + return I == ModuleLocalLookups.end() ? nullptr : &I->second; +} + serialization::reader::LazySpecializationInfoLookupTable * ASTReader::getLoadedSpecializationsLookupTables(const Decl *D, bool IsPartial) { assert(D->isCanonicalDecl()); @@ -8533,6 +8626,12 @@ void ASTReader::PrintStats() { NumVisibleDeclContextsRead, TotalVisibleDeclContexts, ((float)NumVisibleDeclContextsRead/TotalVisibleDeclContexts * 100)); + if (TotalModuleLocalVisibleDeclContexts) + std::fprintf( + stderr, " %u/%u module local visible declcontexts read (%f%%)\n", + NumModuleLocalVisibleDeclContexts, TotalModuleLocalVisibleDeclContexts, + ((float)NumModuleLocalVisibleDeclContexts / + TotalModuleLocalVisibleDeclContexts * 100)); if (TotalNumMethodPoolEntries) std::fprintf(stderr, " %u/%u method pool entries read (%f%%)\n", NumMethodPoolEntriesRead, TotalNumMethodPoolEntries, @@ -12639,3 +12738,25 @@ void ASTRecordReader::readOpenACCClauseList( for (unsigned I = 0; I < Clauses.size(); ++I) Clauses[I] = readOpenACCClause(); } + +static unsigned getStableHashForModuleName(StringRef PrimaryModuleName) { + // TODO: Maybe it is better to check PrimaryModuleName is a valid + // module name? 
+ llvm::FoldingSetNodeID ID; + ID.AddString(PrimaryModuleName); + return ID.computeStableHash(); +} + +std::optional clang::getPrimaryModuleHash(const Module *M) { + if (!M) + return std::nullopt; + + if (M->isHeaderLikeModule()) + return std::nullopt; + + if (M->isGlobalModule()) + return std::nullopt; + + StringRef PrimaryModuleName = M->getPrimaryModuleInterfaceName(); + return getStableHashForModuleName(PrimaryModuleName); +} diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index 95abd75920c8f..1c51a7b5e460f 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -413,7 +413,8 @@ class ASTDeclReader : public DeclVisitor { void VisitEmptyDecl(EmptyDecl *D); void VisitLifetimeExtendedTemporaryDecl(LifetimeExtendedTemporaryDecl *D); - std::pair VisitDeclContext(DeclContext *DC); + void VisitDeclContext(DeclContext *DC, uint64_t &LexicalOffset, + uint64_t &VisibleOffset, uint64_t &ModuleLocalOffset); template RedeclarableResult VisitRedeclarable(Redeclarable *D); @@ -1855,7 +1856,10 @@ void ASTDeclReader::VisitNamespaceDecl(NamespaceDecl *D) { void ASTDeclReader::VisitHLSLBufferDecl(HLSLBufferDecl *D) { VisitNamedDecl(D); - VisitDeclContext(D); + uint64_t LexicalOffset = 0; + uint64_t VisibleOffset = 0; + uint64_t ModuleLocalOffset = 0; + VisitDeclContext(D, LexicalOffset, VisibleOffset, ModuleLocalOffset); D->IsCBuffer = Record.readBool(); D->KwLoc = readSourceLocation(); D->LBraceLoc = readSourceLocation(); @@ -2764,11 +2768,12 @@ void ASTDeclReader::VisitLifetimeExtendedTemporaryDecl( mergeMergeable(D); } -std::pair -ASTDeclReader::VisitDeclContext(DeclContext *DC) { - uint64_t LexicalOffset = ReadLocalOffset(); - uint64_t VisibleOffset = ReadLocalOffset(); - return std::make_pair(LexicalOffset, VisibleOffset); +void ASTDeclReader::VisitDeclContext(DeclContext *DC, uint64_t &LexicalOffset, + uint64_t &VisibleOffset, + uint64_t &ModuleLocalOffset) { + LexicalOffset = ReadLocalOffset(); + VisibleOffset = ReadLocalOffset(); + ModuleLocalOffset = ReadLocalOffset(); } template @@ -3869,6 +3874,7 @@ Decl *ASTReader::ReadDeclRecord(GlobalDeclID ID) { switch ((DeclCode)MaybeDeclCode.get()) { case DECL_CONTEXT_LEXICAL: case DECL_CONTEXT_VISIBLE: + case DECL_CONTEXT_MODULE_LOCAL_VISIBLE: case DECL_SPECIALIZATIONS: case DECL_PARTIAL_SPECIALIZATIONS: llvm_unreachable("Record cannot be de-serialized with readDeclRecord"); @@ -4176,21 +4182,35 @@ Decl *ASTReader::ReadDeclRecord(GlobalDeclID ID) { // If this declaration is also a declaration context, get the // offsets for its tables of lexical and visible declarations. if (auto *DC = dyn_cast(D)) { - std::pair Offsets = Reader.VisitDeclContext(DC); + uint64_t LexicalOffset = 0; + uint64_t VisibleOffset = 0; + uint64_t ModuleLocalOffset = 0; + + Reader.VisitDeclContext(DC, LexicalOffset, VisibleOffset, + ModuleLocalOffset); // Get the lexical and visible block for the delayed namespace. // It is sufficient to judge if ID is in DelayedNamespaceOffsetMap. // But it may be more efficient to filter the other cases. 
- if (!Offsets.first && !Offsets.second && isa(D)) + if (!LexicalOffset && !VisibleOffset && !ModuleLocalOffset && + isa(D)) if (auto Iter = DelayedNamespaceOffsetMap.find(ID); - Iter != DelayedNamespaceOffsetMap.end()) - Offsets = Iter->second; + Iter != DelayedNamespaceOffsetMap.end()) { + LexicalOffset = Iter->second.LexicalOffset; + VisibleOffset = Iter->second.VisibleOffset; + ModuleLocalOffset = Iter->second.ModuleLocalOffset; + } - if (Offsets.first && - ReadLexicalDeclContextStorage(*Loc.F, DeclsCursor, Offsets.first, DC)) + if (LexicalOffset && + ReadLexicalDeclContextStorage(*Loc.F, DeclsCursor, LexicalOffset, DC)) + return nullptr; + if (VisibleOffset && + ReadVisibleDeclContextStorage(*Loc.F, DeclsCursor, VisibleOffset, ID, + /*IsModuleLocal=*/false)) return nullptr; - if (Offsets.second && - ReadVisibleDeclContextStorage(*Loc.F, DeclsCursor, Offsets.second, ID)) + if (ModuleLocalOffset && + ReadVisibleDeclContextStorage(*Loc.F, DeclsCursor, ModuleLocalOffset, + ID, /*IsModuleLocal=*/true)) return nullptr; } assert(Record.getIdx() == Record.size()); @@ -4328,8 +4348,8 @@ void ASTReader::loadDeclUpdateRecords(PendingUpdateRecord &Record) { } // Load the pending visible updates for this decl context, if it has any. - auto I = PendingVisibleUpdates.find(ID); - if (I != PendingVisibleUpdates.end()) { + if (auto I = PendingVisibleUpdates.find(ID); + I != PendingVisibleUpdates.end()) { auto VisibleUpdates = std::move(I->second); PendingVisibleUpdates.erase(I); @@ -4341,6 +4361,21 @@ void ASTReader::loadDeclUpdateRecords(PendingUpdateRecord &Record) { DC->setHasExternalVisibleStorage(true); } + if (auto I = PendingModuleLocalVisibleUpdates.find(ID); + I != PendingModuleLocalVisibleUpdates.end()) { + auto ModuleLocalVisibleUpdates = std::move(I->second); + PendingModuleLocalVisibleUpdates.erase(I); + + auto *DC = cast(D)->getPrimaryContext(); + for (const auto &Update : ModuleLocalVisibleUpdates) + ModuleLocalLookups[DC].Table.add( + Update.Mod, Update.Data, + reader::ModuleLocalNameLookupTrait(*this, *Update.Mod)); + // NOTE: Can we optimize the case that the data being loaded + // is not related to current module? + DC->setHasExternalVisibleStorage(true); + } + // Load any pending related decls. if (D->isCanonicalDecl()) { if (auto IT = RelatedDeclsMap.find(ID); IT != RelatedDeclsMap.end()) { diff --git a/clang/lib/Serialization/ASTReaderInternals.h b/clang/lib/Serialization/ASTReaderInternals.h index be0d22d1f4094..4be2b2323ec40 100644 --- a/clang/lib/Serialization/ASTReaderInternals.h +++ b/clang/lib/Serialization/ASTReaderInternals.h @@ -31,6 +31,7 @@ class FileEntry; struct HeaderFileInfo; class HeaderSearch; class ObjCMethodDecl; +class Module; namespace serialization { @@ -38,9 +39,8 @@ class ModuleFile; namespace reader { -/// Class that performs name lookup into a DeclContext stored -/// in an AST file. 
-class ASTDeclContextNameLookupTrait {
+class ASTDeclContextNameLookupTraitBase {
+protected:
   ASTReader &Reader;
   ModuleFile &F;
 
@@ -80,11 +80,37 @@ class ASTDeclContextNameLookupTrait {
   using offset_type = unsigned;
   using file_type = ModuleFile *;
 
-  using external_key_type = DeclarationName;
-  using internal_key_type = DeclarationNameKey;
+protected:
+  explicit ASTDeclContextNameLookupTraitBase(ASTReader &Reader, ModuleFile &F)
+      : Reader(Reader), F(F) {}
+
+public:
+  static std::pair<unsigned, unsigned>
+  ReadKeyDataLength(const unsigned char *&d);
+
+  void ReadDataIntoImpl(const unsigned char *d, unsigned DataLen,
+                        data_type_builder &Val);
+
+  static void MergeDataInto(const data_type &From, data_type_builder &To) {
+    To.Data.reserve(To.Data.size() + From.size());
+    for (GlobalDeclID ID : From)
+      To.insert(ID);
+  }
+
+  file_type ReadFileRef(const unsigned char *&d);
+
+  DeclarationNameKey ReadKeyBase(const unsigned char *&d);
+};
 
+/// Class that performs name lookup into a DeclContext stored
+/// in an AST file.
+class ASTDeclContextNameLookupTrait : public ASTDeclContextNameLookupTraitBase {
+public:
   explicit ASTDeclContextNameLookupTrait(ASTReader &Reader, ModuleFile &F)
-      : Reader(Reader), F(F) {}
+      : ASTDeclContextNameLookupTraitBase(Reader, F) {}
+
+  using external_key_type = DeclarationName;
+  using internal_key_type = DeclarationNameKey;
 
   static bool EqualKey(const internal_key_type &a, const internal_key_type &b) {
     return a == b;
@@ -98,25 +124,39 @@ class ASTDeclContextNameLookupTrait {
     return Name;
   }
 
-  static std::pair<unsigned, unsigned>
-  ReadKeyDataLength(const unsigned char *&d);
-
   internal_key_type ReadKey(const unsigned char *d, unsigned);
 
   void ReadDataInto(internal_key_type, const unsigned char *d,
                     unsigned DataLen, data_type_builder &Val);
+};
 
-  static void MergeDataInto(const data_type &From, data_type_builder &To) {
-    To.Data.reserve(To.Data.size() + From.size());
-    for (GlobalDeclID ID : From)
-      To.insert(ID);
+struct DeclContextLookupTable {
+  MultiOnDiskHashTable<ASTDeclContextNameLookupTrait> Table;
+};
+
+class ModuleLocalNameLookupTrait : public ASTDeclContextNameLookupTraitBase {
+public:
+  explicit ModuleLocalNameLookupTrait(ASTReader &Reader, ModuleFile &F)
+      : ASTDeclContextNameLookupTraitBase(Reader, F) {}
+
+  using external_key_type = std::pair<DeclarationName, const Module *>;
+  using internal_key_type = std::pair<DeclarationNameKey, unsigned>;
+
+  static bool EqualKey(const internal_key_type &a, const internal_key_type &b) {
+    return a == b;
   }
 
-  file_type ReadFileRef(const unsigned char *&d);
+  static hash_value_type ComputeHash(const internal_key_type &Key);
+  static internal_key_type GetInternalKey(const external_key_type &Key);
+
+  internal_key_type ReadKey(const unsigned char *d, unsigned);
+
+  void ReadDataInto(internal_key_type, const unsigned char *d,
+                    unsigned DataLen, data_type_builder &Val);
 };
 
-struct DeclContextLookupTable {
-  MultiOnDiskHashTable<ASTDeclContextNameLookupTrait> Table;
+struct ModuleLocalLookupTable {
+  MultiOnDiskHashTable<ModuleLocalNameLookupTrait> Table;
 };
 
 using LazySpecializationInfo = GlobalDeclID;
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 345d496a93312..a6f8c6009f1ff 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -1088,6 +1088,7 @@ void ASTWriter::WriteBlockInfoBlock() {
   RECORD(DECL_BLOCK);
   RECORD(DECL_CONTEXT_LEXICAL);
   RECORD(DECL_CONTEXT_VISIBLE);
+  RECORD(DECL_CONTEXT_MODULE_LOCAL_VISIBLE);
   RECORD(DECL_NAMESPACE);
   RECORD(DECL_NAMESPACE_ALIAS);
   RECORD(DECL_USING);
@@ -4024,15 +4025,13 @@ void ASTWriter::handleVTable(CXXRecordDecl *RD) {
 
 namespace {
 
-// Trait used for the on-disk hash table used in the
method pool. -class ASTDeclContextNameLookupTrait { +class ASTDeclContextNameLookupTraitBase { +protected: ASTWriter &Writer; - llvm::SmallVector DeclIDs; + using DeclIDsTy = llvm::SmallVector; + DeclIDsTy DeclIDs; public: - using key_type = DeclarationNameKey; - using key_type_ref = key_type; - /// A start and end index into DeclIDs, representing a sequence of decls. using data_type = std::pair; using data_type_ref = const data_type &; @@ -4040,31 +4039,11 @@ class ASTDeclContextNameLookupTrait { using hash_value_type = unsigned; using offset_type = unsigned; - explicit ASTDeclContextNameLookupTrait(ASTWriter &Writer) : Writer(Writer) {} - - template - data_type getData(const Coll &Decls) { - unsigned Start = DeclIDs.size(); - for (NamedDecl *D : Decls) { - NamedDecl *DeclForLocalLookup = - getDeclForLocalLookup(Writer.getLangOpts(), D); - - if (Writer.getDoneWritingDeclsAndTypes() && - !Writer.wasDeclEmitted(DeclForLocalLookup)) - continue; - - // Try to avoid writing internal decls to reduced BMI. - // See comments in ASTWriter::WriteDeclContextLexicalBlock for details. - if (Writer.isGeneratingReducedBMI() && - !DeclForLocalLookup->isFromExplicitGlobalModule() && - IsInternalDeclFromFileContext(DeclForLocalLookup)) - continue; - - DeclIDs.push_back(Writer.GetDeclRef(DeclForLocalLookup)); - } - return std::make_pair(Start, DeclIDs.size()); - } +protected: + explicit ASTDeclContextNameLookupTraitBase(ASTWriter &Writer) + : Writer(Writer) {} +public: data_type ImportData(const reader::ASTDeclContextNameLookupTrait::data_type &FromReader) { unsigned Start = DeclIDs.size(); DeclIDs.insert( @@ -4074,14 +4053,6 @@ class ASTDeclContextNameLookupTrait { return std::make_pair(Start, DeclIDs.size()); } - static bool EqualKey(key_type_ref a, key_type_ref b) { - return a == b; - } - - hash_value_type ComputeHash(DeclarationNameKey Name) { - return Name.getHash(); - } - void EmitFileRef(raw_ostream &Out, ModuleFile *F) const { assert(Writer.hasChain() && "have reference to loaded module file but no chain?"); @@ -4092,9 +4063,9 @@ class ASTDeclContextNameLookupTrait { llvm::endianness::little); } - std::pair EmitKeyDataLength(raw_ostream &Out, - DeclarationNameKey Name, - data_type_ref Lookup) { + std::pair EmitKeyDataLengthBase(raw_ostream &Out, + DeclarationNameKey Name, + data_type_ref Lookup) { unsigned KeyLen = 1; switch (Name.getKind()) { case DeclarationName::Identifier: @@ -4120,10 +4091,10 @@ class ASTDeclContextNameLookupTrait { // length of DeclIDs. 
unsigned DataLen = sizeof(DeclID) * (Lookup.second - Lookup.first); - return emitULEBKeyDataLength(KeyLen, DataLen, Out); + return {KeyLen, DataLen}; } - void EmitKey(raw_ostream &Out, DeclarationNameKey Name, unsigned) { + void EmitKeyBase(raw_ostream &Out, DeclarationNameKey Name) { using namespace llvm::support; endian::Writer LE(Out, llvm::endianness::little); @@ -4154,8 +4125,7 @@ class ASTDeclContextNameLookupTrait { llvm_unreachable("Invalid name kind?"); } - void EmitData(raw_ostream &Out, key_type_ref, data_type Lookup, - unsigned DataLen) { + void EmitDataBase(raw_ostream &Out, data_type Lookup, unsigned DataLen) { using namespace llvm::support; endian::Writer LE(Out, llvm::endianness::little); @@ -4166,6 +4136,129 @@ class ASTDeclContextNameLookupTrait { } }; +class ModuleLocalNameLookupTrait : public ASTDeclContextNameLookupTraitBase { +public: + using primary_module_hash_type = unsigned; + + using key_type = std::pair; + using key_type_ref = key_type; + + explicit ModuleLocalNameLookupTrait(ASTWriter &Writer) + : ASTDeclContextNameLookupTraitBase(Writer) {} + + data_type getData(const DeclIDsTy &LocalIDs) { + unsigned Start = DeclIDs.size(); + for (auto ID : LocalIDs) + DeclIDs.push_back(ID); + return std::make_pair(Start, DeclIDs.size()); + } + + static bool EqualKey(key_type_ref a, key_type_ref b) { return a == b; } + + hash_value_type ComputeHash(key_type Key) { + llvm::FoldingSetNodeID ID; + ID.AddInteger(Key.first.getHash()); + ID.AddInteger(Key.second); + return ID.computeStableHash(); + } + + std::pair + EmitKeyDataLength(raw_ostream &Out, key_type Key, data_type_ref Lookup) { + auto [KeyLen, DataLen] = EmitKeyDataLengthBase(Out, Key.first, Lookup); + KeyLen += sizeof(Key.second); + return emitULEBKeyDataLength(KeyLen, DataLen, Out); + } + + void EmitKey(raw_ostream &Out, key_type Key, unsigned) { + EmitKeyBase(Out, Key.first); + llvm::support::endian::Writer LE(Out, llvm::endianness::little); + LE.write(Key.second); + } + + void EmitData(raw_ostream &Out, key_type_ref, data_type Lookup, + unsigned DataLen) { + EmitDataBase(Out, Lookup, DataLen); + } +}; + +// Trait used for the on-disk hash table used in the method pool. +class ASTDeclContextNameLookupTrait : public ASTDeclContextNameLookupTraitBase { +public: + using ModuleLocalDeclsMapTy = + llvm::DenseMap; + +private: + ModuleLocalDeclsMapTy ModuleLocalDeclsMap; + +public: + using key_type = DeclarationNameKey; + using key_type_ref = key_type; + + explicit ASTDeclContextNameLookupTrait(ASTWriter &Writer) + : ASTDeclContextNameLookupTraitBase(Writer) {} + + template data_type getData(const Coll &Decls) { + unsigned Start = DeclIDs.size(); + for (NamedDecl *D : Decls) { + NamedDecl *DeclForLocalLookup = + getDeclForLocalLookup(Writer.getLangOpts(), D); + + if (Writer.getDoneWritingDeclsAndTypes() && + !Writer.wasDeclEmitted(DeclForLocalLookup)) + continue; + + // Try to avoid writing internal decls to reduced BMI. + // See comments in ASTWriter::WriteDeclContextLexicalBlock for details. 
+    if (Writer.isGeneratingReducedBMI() &&
+        !DeclForLocalLookup->isFromExplicitGlobalModule() &&
+        IsInternalDeclFromFileContext(DeclForLocalLookup))
+      continue;
+
+      auto ID = Writer.GetDeclRef(DeclForLocalLookup);
+
+      if (D->getFormalLinkage() == Linkage::Module) {
+        if (std::optional<unsigned> PrimaryModuleHash =
+                getPrimaryModuleHash(D->getOwningModule())) {
+          auto Key = std::make_pair(D->getDeclName(), *PrimaryModuleHash);
+          auto Iter = ModuleLocalDeclsMap.find(Key);
+          if (Iter == ModuleLocalDeclsMap.end())
+            ModuleLocalDeclsMap.insert({Key, DeclIDsTy{ID}});
+          else
+            Iter->second.push_back(ID);
+          continue;
+        }
+      }
+
+      DeclIDs.push_back(ID);
+    }
+    return std::make_pair(Start, DeclIDs.size());
+  }
+
+  const ModuleLocalDeclsMapTy &getModuleLocalDecls() {
+    return ModuleLocalDeclsMap;
+  }
+
+  static bool EqualKey(key_type_ref a, key_type_ref b) { return a == b; }
+
+  hash_value_type ComputeHash(key_type Name) { return Name.getHash(); }
+
+  std::pair<unsigned, unsigned> EmitKeyDataLength(raw_ostream &Out,
+                                                  DeclarationNameKey Name,
+                                                  data_type_ref Lookup) {
+    auto [KeyLen, DataLen] = EmitKeyDataLengthBase(Out, Name, Lookup);
+    return emitULEBKeyDataLength(KeyLen, DataLen, Out);
+  }
+
+  void EmitKey(raw_ostream &Out, DeclarationNameKey Name, unsigned) {
+    return EmitKeyBase(Out, Name);
+  }
+
+  void EmitData(raw_ostream &Out, key_type_ref, data_type Lookup,
+                unsigned DataLen) {
+    EmitDataBase(Out, Lookup, DataLen);
+  }
+};
+
 } // namespace
 
 namespace {
 
@@ -4371,7 +4464,8 @@ static bool isLookupResultNotInteresting(ASTWriter &Writer,
 void ASTWriter::GenerateNameLookupTable(
     ASTContext &Context, const DeclContext *ConstDC,
-    llvm::SmallVectorImpl<char> &LookupTable) {
+    llvm::SmallVectorImpl<char> &LookupTable,
+    llvm::SmallVectorImpl<char> &ModuleLocalLookupTable) {
   assert(!ConstDC->hasLazyLocalLexicalLookups() &&
          !ConstDC->hasLazyExternalLexicalLookups() &&
          "must call buildLookups first");
@@ -4553,6 +4647,28 @@ void ASTWriter::GenerateNameLookupTable(
   // merged table if there is one.
   auto *Lookups = Chain ? Chain->getLoadedLookupTables(DC) : nullptr;
   Generator.emit(LookupTable, Trait, Lookups ? &Lookups->Table : nullptr);
+
+  const auto &ModuleLocalDecls = Trait.getModuleLocalDecls();
+  if (ModuleLocalDecls.empty())
+    return;
+
+  MultiOnDiskHashTableGenerator<reader::ModuleLocalNameLookupTrait,
+                                ModuleLocalNameLookupTrait>
+      ModuleLocalLookupGenerator;
+  ModuleLocalNameLookupTrait ModuleLocalTrait(*this);
+
+  for (const auto &ModuleLocalIter : ModuleLocalDecls) {
+    const auto &Key = ModuleLocalIter.first;
+    const auto &IDs = ModuleLocalIter.second;
+    ModuleLocalLookupGenerator.insert(Key, ModuleLocalTrait.getData(IDs),
+                                      ModuleLocalTrait);
+  }
+
+  auto *ModuleLocalLookups =
+      Chain ? Chain->getModuleLocalLookupTables(DC) : nullptr;
+  ModuleLocalLookupGenerator.emit(
+      ModuleLocalLookupTable, ModuleLocalTrait,
+      ModuleLocalLookups ? &ModuleLocalLookups->Table : nullptr);
 }
 
 /// Write the block containing all of the declaration IDs
@@ -4560,8 +4676,10 @@ void ASTWriter::GenerateNameLookupTable(
 ///
 /// \returns the offset of the DECL_CONTEXT_VISIBLE block within the
 /// bitstream, or 0 if no block was written.
-uint64_t ASTWriter::WriteDeclContextVisibleBlock(ASTContext &Context,
-                                                 DeclContext *DC) {
+void ASTWriter::WriteDeclContextVisibleBlock(ASTContext &Context,
+                                             DeclContext *DC,
+                                             uint64_t &VisibleBlockOffset,
+                                             uint64_t &ModuleLocalBlockOffset) {
   // If we imported a key declaration of this namespace, write the visible
   // lookup results as an update record for it rather than including them
   // on this declaration. We will only look at key declarations on reload.
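[Editor's note -- illustration, not part of the patch. The getData() change
above is the heart of the new scheme: lookup results for a name are now split
into a general list and a per-module list keyed by (name, primary module
hash). A minimal stand-alone sketch of that partitioning, using plain
standard-library stand-ins for the clang types (Decl, DeclarationName,
DeclIDsTy, and getPrimaryModuleHash below are models, not the real APIs):

  #include <cstdint>
  #include <map>
  #include <optional>
  #include <string>
  #include <utility>
  #include <vector>

  using DeclID = std::uint64_t;

  struct Decl {
    std::string Name;                    // stands in for DeclarationName
    bool HasModuleLinkage = false;       // getFormalLinkage() == Linkage::Module
    std::optional<unsigned> ModuleHash;  // getPrimaryModuleHash(owning module)
    DeclID ID = 0;
  };

  // Module-linkage names are routed to a table keyed by (name, module hash),
  // so a lookup issued from a different module never sees them; everything
  // else goes to the general visible table.
  void partitionLookupResults(
      const std::vector<Decl> &Decls, std::vector<DeclID> &GeneralTable,
      std::map<std::pair<std::string, unsigned>, std::vector<DeclID>>
          &ModuleLocalTable) {
    for (const Decl &D : Decls) {
      if (D.HasModuleLinkage && D.ModuleHash) {
        ModuleLocalTable[{D.Name, *D.ModuleHash}].push_back(D.ID);
        continue;
      }
      GeneralTable.push_back(D.ID);
    }
  }
]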
@@ -4571,7 +4689,7 @@ uint64_t ASTWriter::WriteDeclContextVisibleBlock(ASTContext &Context,
     for (auto *Prev = cast<NamespaceDecl>(DC)->getPreviousDecl(); Prev;
          Prev = Prev->getPreviousDecl())
       if (!Prev->isFromASTFile())
-        return 0;
+        return;
 
     // Note that we need to emit an update record for the primary context.
     UpdatedDeclContexts.insert(DC->getPrimaryContext());
@@ -4620,41 +4738,53 @@ uint64_t ASTWriter::WriteDeclContextVisibleBlock(ASTContext &Context,
       }
     }
 
-    return 0;
+    return;
   }
 
   if (DC->getPrimaryContext() != DC)
-    return 0;
+    return;
 
   // Skip contexts which don't support name lookup.
   if (!DC->isLookupContext())
-    return 0;
+    return;
 
   // If not in C++, we perform name lookup for the translation unit via the
   // IdentifierInfo chains, don't bother to build a visible-declarations table.
   if (DC->isTranslationUnit() && !Context.getLangOpts().CPlusPlus)
-    return 0;
+    return;
 
   // Serialize the contents of the mapping used for lookup. Note that,
   // although we have two very different code paths, the serialized
   // representation is the same for both cases: a declaration name,
   // followed by a size, followed by references to the visible
   // declarations that have that name.
-  uint64_t Offset = Stream.GetCurrentBitNo();
   StoredDeclsMap *Map = DC->buildLookup();
   if (!Map || Map->empty())
-    return 0;
+    return;
 
+  VisibleBlockOffset = Stream.GetCurrentBitNo();
   // Create the on-disk hash table in a buffer.
   SmallString<4096> LookupTable;
-  GenerateNameLookupTable(Context, DC, LookupTable);
+  SmallString<4096> ModuleLocalLookupTable;
+  GenerateNameLookupTable(Context, DC, LookupTable, ModuleLocalLookupTable);
 
   // Write the lookup table
   RecordData::value_type Record[] = {DECL_CONTEXT_VISIBLE};
   Stream.EmitRecordWithBlob(DeclContextVisibleLookupAbbrev, Record,
                             LookupTable);
   ++NumVisibleDeclContexts;
-  return Offset;
+
+  if (ModuleLocalLookupTable.empty())
+    return;
+
+  ModuleLocalBlockOffset = Stream.GetCurrentBitNo();
+  assert(ModuleLocalBlockOffset > VisibleBlockOffset);
+  // Write the lookup table
+  RecordData::value_type ModuleLocalRecord[] = {
+      DECL_CONTEXT_MODULE_LOCAL_VISIBLE};
+  Stream.EmitRecordWithBlob(DeclModuleLocalVisibleLookupAbbrev,
+                            ModuleLocalRecord, ModuleLocalLookupTable);
+  ++NumModuleLocalDeclContexts;
 }
 
 /// Write an UPDATE_VISIBLE block for the given context.
@@ -4671,7 +4801,8 @@ void ASTWriter::WriteDeclContextVisibleUpdate(ASTContext &Context,
 
   // Create the on-disk hash table in a buffer.
   SmallString<4096> LookupTable;
-  GenerateNameLookupTable(Context, DC, LookupTable);
+  SmallString<4096> ModuleLocalLookupTable;
+  GenerateNameLookupTable(Context, DC, LookupTable, ModuleLocalLookupTable);
 
   // If we're updating a namespace, select a key declaration as the key for the
   // update record; those are the only ones that will be checked on reload.
@@ -4682,6 +4813,15 @@ void ASTWriter::WriteDeclContextVisibleUpdate(ASTContext &Context,
   RecordData::value_type Record[] = {UPDATE_VISIBLE,
                                      getDeclID(cast<NamedDecl>(DC)).getRawValue()};
   Stream.EmitRecordWithBlob(UpdateVisibleAbbrev, Record, LookupTable);
+
+  if (ModuleLocalLookupTable.empty())
+    return;
+
+  // Write the module local lookup table
+  RecordData::value_type ModuleLocalRecord[] = {
+      UPDATE_MODULE_LOCAL_VISIBLE, getDeclID(cast<NamedDecl>(DC)).getRawValue()};
+  Stream.EmitRecordWithBlob(ModuleLocalUpdateVisibleAbbrev, ModuleLocalRecord,
+                            ModuleLocalLookupTable);
 }
 
 /// Write an FP_PRAGMA_OPTIONS block for the given FPOptions.
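[Editor's note -- illustration, not part of the patch. With the signature
change above, WriteDeclContextVisibleBlock no longer returns a single offset;
callers zero-initialize two out-parameters and treat a remaining zero as "no
block was written". A minimal sketch of the migrated call site, mirroring the
ASTWriterDecl.cpp hunk later in this patch (variable names as used there):

  uint64_t VisibleOffset = 0;
  uint64_t ModuleLocalOffset = 0;
  Writer.WriteDeclContextVisibleBlock(Record.getASTContext(), DC,
                                      VisibleOffset, ModuleLocalOffset);
  // A zero offset means the corresponding table was empty or skipped; only
  // when both blocks are actually emitted does the writer assert that
  // ModuleLocalBlockOffset > VisibleBlockOffset (they are written in order).
  Record.AddOffset(VisibleOffset);
  Record.AddOffset(ModuleLocalOffset);
]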
@@ -5865,7 +6005,8 @@ ASTFileSignature ASTWriter::WriteASTCore(Sema *SemaPtr, StringRef isysroot, // Some simple statistics RecordData::value_type Record[] = { - NumStatements, NumMacros, NumLexicalDeclContexts, NumVisibleDeclContexts}; + NumStatements, NumMacros, NumLexicalDeclContexts, NumVisibleDeclContexts, + NumModuleLocalDeclContexts}; Stream.EmitRecord(STATISTICS, Record); Stream.ExitBlock(); Stream.FlushToWord(); @@ -5942,7 +6083,9 @@ void ASTWriter::WriteDeclAndTypes(ASTContext &Context) { RecordData DelayedNamespaceRecord; for (NamespaceDecl *NS : DelayedNamespace) { uint64_t LexicalOffset = WriteDeclContextLexicalBlock(Context, NS); - uint64_t VisibleOffset = WriteDeclContextVisibleBlock(Context, NS); + uint64_t VisibleOffset = 0; + uint64_t ModuleLocalOffset = 0; + WriteDeclContextVisibleBlock(Context, NS, VisibleOffset, ModuleLocalOffset); // Write the offset relative to current block. if (LexicalOffset) @@ -5951,9 +6094,13 @@ void ASTWriter::WriteDeclAndTypes(ASTContext &Context) { if (VisibleOffset) VisibleOffset -= DeclTypesBlockStartOffset; + if (ModuleLocalOffset) + ModuleLocalOffset -= DeclTypesBlockStartOffset; + AddDeclRef(NS, DelayedNamespaceRecord); DelayedNamespaceRecord.push_back(LexicalOffset); DelayedNamespaceRecord.push_back(VisibleOffset); + DelayedNamespaceRecord.push_back(ModuleLocalOffset); } // The process of writing lexical and visible block for delayed namespace @@ -6033,6 +6180,12 @@ void ASTWriter::WriteDeclAndTypes(ASTContext &Context) { Abv->Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::Blob)); UpdateVisibleAbbrev = Stream.EmitAbbrev(std::move(Abv)); + Abv = std::make_shared(); + Abv->Add(llvm::BitCodeAbbrevOp(UPDATE_MODULE_LOCAL_VISIBLE)); + Abv->Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::VBR, 6)); + Abv->Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::Blob)); + ModuleLocalUpdateVisibleAbbrev = Stream.EmitAbbrev(std::move(Abv)); + // And a visible updates block for the translation unit. 
WriteDeclContextVisibleUpdate(Context, TU); diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index 3b357f3c50dad..7a494cfe1ac64 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -2068,6 +2068,7 @@ void ASTDeclWriter::VisitDeclContext(DeclContext *DC) { uint64_t LexicalOffset = 0; uint64_t VisibleOffset = 0; + uint64_t ModuleLocalOffset = 0; if (Writer.isGeneratingReducedBMI() && isa(DC) && cast(DC)->isFromExplicitGlobalModule()) { @@ -2078,12 +2079,13 @@ void ASTDeclWriter::VisitDeclContext(DeclContext *DC) { } else { LexicalOffset = Writer.WriteDeclContextLexicalBlock(Record.getASTContext(), DC); - VisibleOffset = - Writer.WriteDeclContextVisibleBlock(Record.getASTContext(), DC); + Writer.WriteDeclContextVisibleBlock(Record.getASTContext(), DC, + VisibleOffset, ModuleLocalOffset); } Record.AddOffset(LexicalOffset); Record.AddOffset(VisibleOffset); + Record.AddOffset(ModuleLocalOffset); } const Decl *ASTWriter::getFirstLocalDecl(const Decl *D) { @@ -2438,6 +2440,7 @@ void ASTWriter::WriteDeclAbbrevs() { // DC Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LexicalOffset Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // VisibleOffset + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // ModuleLocalOffset DeclEnumAbbrev = Stream.EmitAbbrev(std::move(Abv)); // Abbreviation for DECL_RECORD @@ -2490,6 +2493,7 @@ void ASTWriter::WriteDeclAbbrevs() { // DC Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LexicalOffset Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // VisibleOffset + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // ModuleLocalOffset DeclRecordAbbrev = Stream.EmitAbbrev(std::move(Abv)); // Abbreviation for DECL_PARM_VAR @@ -2827,6 +2831,11 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); DeclContextVisibleLookupAbbrev = Stream.EmitAbbrev(std::move(Abv)); + Abv = std::make_shared(); + Abv->Add(BitCodeAbbrevOp(serialization::DECL_CONTEXT_MODULE_LOCAL_VISIBLE)); + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); + DeclModuleLocalVisibleLookupAbbrev = Stream.EmitAbbrev(std::move(Abv)); + Abv = std::make_shared(); Abv->Add(BitCodeAbbrevOp(serialization::DECL_SPECIALIZATIONS)); Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); diff --git a/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp b/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp index d69db40062dae..54ec6aa61ec37 100644 --- a/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp +++ b/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp @@ -62,8 +62,8 @@ void test_late() { not_exported = 1; #ifndef IMPLEMENTATION - // expected-error@-2 {{declaration of 'not_exported' must be imported from module 'A' before it is required}} - // expected-note@p2.cpp:19 {{declaration here is not visible}} + // expected-error@-2 {{use of undeclared identifier 'not_exported'; did you mean 'exported'?}} + // expected-note@p2.cpp:18 {{'exported' declared here}} #endif internal = 1; diff --git a/clang/test/CXX/module/basic/basic.link/p2.cppm b/clang/test/CXX/module/basic/basic.link/p2.cppm index 19761fb3359ce..5a497304201dc 100644 --- a/clang/test/CXX/module/basic/basic.link/p2.cppm +++ b/clang/test/CXX/module/basic/basic.link/p2.cppm @@ -62,12 +62,11 @@ import M; void use_from_module_impl() { external_linkage_fn(); - module_linkage_fn(); // expected-error {{declaration of 'module_linkage_fn' must be imported}} + module_linkage_fn(); 
// expected-error {{use of undeclared identifier 'module_linkage_fn'}} internal_linkage_fn(); // expected-error {{declaration of 'internal_linkage_fn' must be imported}} (void)external_linkage_class{}; (void)module_linkage_class{}; // expected-error {{undeclared identifier}} expected-error 0+{{}} (void)internal_linkage_class{}; // expected-error {{undeclared identifier}} expected-error 0+{{}} - // expected-note@M.cppm:9 {{declaration here is not visible}} // expected-note@M.cppm:10 {{declaration here is not visible}} (void)external_linkage_var; (void)module_linkage_var; // expected-error {{undeclared identifier}} diff --git a/clang/test/CXX/module/module.import/p2.cpp b/clang/test/CXX/module/module.import/p2.cpp index 6b8e32f746b62..0ad3bc815beac 100644 --- a/clang/test/CXX/module/module.import/p2.cpp +++ b/clang/test/CXX/module/module.import/p2.cpp @@ -23,10 +23,7 @@ export A f(); //--- Use.cpp import M; void test() { - A a; // expected-error {{definition of 'A' must be imported from module 'M' before it is required}} - // expected-error@-1 {{definition of 'A' must be imported from module 'M' before it is required}} expected-error@-1 {{}} - // expected-note@impl.cppm:2 {{declaration here is not visible}} - // expected-note@impl.cppm:2 {{definition here is not reachable}} expected-note@impl.cppm:2 {{}} + A a; // expected-error {{unknown type name 'A'}} } //--- UseInPartA.cppm @@ -40,10 +37,7 @@ void test() { export module B; import M; void test() { - A a; // expected-error {{declaration of 'A' must be imported from module 'M'}} - // expected-error@-1 {{definition of 'A' must be imported from module 'M'}} expected-error@-1 {{}} - // expected-note@impl.cppm:2 {{declaration here is not visible}} - // expected-note@impl.cppm:2 {{definition here is not reachable}} expected-note@impl.cppm:2 {{}} + A a; // expected-error {{unknown type name 'A'}} } //--- Private.cppm diff --git a/clang/test/CXX/module/module.interface/p7.cpp b/clang/test/CXX/module/module.interface/p7.cpp index 1572390f0d289..cff5df91e43d4 100644 --- a/clang/test/CXX/module/module.interface/p7.cpp +++ b/clang/test/CXX/module/module.interface/p7.cpp @@ -57,12 +57,10 @@ void test() { void test2() { auto a = E1::e1; // OK, namespace-scope name E1 is visible and e1 is reachable auto b = e1; // OK, namespace-scope name e1 is visible - auto c = E2::e2; // expected-error {{declaration of 'E2' must be imported from module}} - // expected-note@* {{declaration here is not visible}} - auto d = e2; // should be error, namespace-scope name e2 is not visible + auto c = E2::e2; // expected-error {{use of undeclared identifier 'E2'}} + auto d = e2; // expected-error {{use of undeclared identifier 'e2'}} auto e = E2U::e2; // OK, namespace-scope name E2U is visible and E2::e2 is reachable - auto f = E3::e3; // expected-error {{declaration of 'E3' must be imported from module 'p7' before it is required}} - // expected-note@* {{declaration here is not visible}} - auto g = e3; // should be error, namespace-scope name e3 is not visible + auto f = E3::e3; // expected-error {{use of undeclared identifier 'E3'}} + auto g = e3; // expected-error {{use of undeclared identifier 'e3'}} auto h = decltype(func())::e3; // OK, namespace-scope name f is visible and E3::e3 is reachable } diff --git a/clang/test/CXX/module/module.reach/p5.cpp b/clang/test/CXX/module/module.reach/p5.cpp index 9c498a260530f..947fd082553ec 100644 --- a/clang/test/CXX/module/module.reach/p5.cpp +++ b/clang/test/CXX/module/module.reach/p5.cpp @@ -14,5 +14,4 @@ export using Y = X; export 
module B; import A; Y y; // OK, definition of X is reachable -X x; // expected-error {{declaration of 'X' must be imported from module 'A' before it is required}} - // expected-note@* {{declaration here is not visible}} +X x; // expected-error {{unknown type name 'X'}} diff --git a/clang/test/Modules/Reachability-template-default-arg.cpp b/clang/test/Modules/Reachability-template-default-arg.cpp index 35c647d0d344b..a7da86b8cc2d5 100644 --- a/clang/test/Modules/Reachability-template-default-arg.cpp +++ b/clang/test/Modules/Reachability-template-default-arg.cpp @@ -21,6 +21,5 @@ struct A { import template_default_arg; void bar() { A<> a0; - A a1; // expected-error {{declaration of 't' must be imported from module 'template_default_arg' before it is required}} - // expected-note@* {{declaration here is not visible}} + A a1; // expected-error {{use of undeclared identifier 't'}} } diff --git a/clang/test/Modules/cxx20-10-1-ex2.cpp b/clang/test/Modules/cxx20-10-1-ex2.cpp index fc61d89926d44..8611d6d64c851 100644 --- a/clang/test/Modules/cxx20-10-1-ex2.cpp +++ b/clang/test/Modules/cxx20-10-1-ex2.cpp @@ -78,8 +78,7 @@ int &c = n; // OK //--- std10-1-ex2-tu6.cpp import B; // error, n is module-local and this is not a module. -int &c = n; // expected-error {{declaration of 'n' must be imported}} - // expected-note@* {{declaration here is not visible}} +int &c = n; // expected-error {{use of undeclared identifier 'n'}} //--- std10-1-ex2-tu7.cpp // expected-no-diagnostics diff --git a/clang/test/Modules/deduction-guide3.cppm b/clang/test/Modules/deduction-guide3.cppm index 1165dd40bcfb8..f7990004cec7c 100644 --- a/clang/test/Modules/deduction-guide3.cppm +++ b/clang/test/Modules/deduction-guide3.cppm @@ -22,8 +22,6 @@ Templ(T t) -> Templ; //--- Use.cpp import Templ; void func() { - Templ t(5); // expected-error {{declaration of 'Templ' must be imported from module 'Templ' before it is required}} - // expected-error@-1 {{unknown type name 'Templ'}} - // expected-note@Templ.cppm:3 {{declaration here is not visible}} + Templ t(5); // expected-error {{unknown type name 'Templ'}} } diff --git a/clang/test/Modules/module-local-with-templates.cppm b/clang/test/Modules/module-local-with-templates.cppm new file mode 100644 index 0000000000000..87955bdd3f99e --- /dev/null +++ b/clang/test/Modules/module-local-with-templates.cppm @@ -0,0 +1,79 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 %t/a.cppm -emit-module-interface -o %t/a.pcm +// RUN: %clang_cc1 -std=c++20 %t/use.cc -fmodule-file=a=%t/a.pcm -fsyntax-only -verify +// RUN: %clang_cc1 -std=c++20 %t/a-part.cppm -fmodule-file=a=%t/a.pcm -fsyntax-only -verify +// +// Test again with reduced BMI +// RUN: %clang_cc1 -std=c++20 %t/a.cppm -emit-reduced-module-interface -o %t/a.pcm +// RUN: %clang_cc1 -std=c++20 %t/use.cc -fmodule-file=a=%t/a.pcm -fsyntax-only -verify +// RUN: %clang_cc1 -std=c++20 %t/a-part.cppm -fmodule-file=a=%t/a.pcm -fsyntax-only -verify +// RUN: %clang_cc1 -std=c++20 %t/a.cc -fmodule-file=a=%t/a.pcm -fsyntax-only -verify + + +//--- a.cppm +export module a; + +constexpr int x = 43; + +export constexpr int f() { return x; } + +export template +constexpr T g() { + return x; +} + +namespace nn { + +constexpr int x = 88; + +export constexpr int f() { return x; } + +export template +constexpr T g() { + return x; +} +} + +//--- use.cc +// expected-no-diagnostics +import a; + +static_assert(f() == 43, ""); + +constexpr int x = 99; + +static_assert(g() == 43, ""); + +static_assert(x == 99, 
""); + +namespace nn { +static_assert(f() == 88, ""); + +constexpr int x = 1000; + +static_assert(g() == 88, ""); + +static_assert(x == 1000, ""); + +} + +//--- a-part.cppm +module a:impl; +import a; + +static_assert(x == 43, ""); + +constexpr int x = 1000; // expected-error {{redefinition of 'x'}} + // expected-note@* {{previous definition is here}} + +//--- a.cc +module a; + +static_assert(x == 43, ""); + +constexpr int x = 1000; // expected-error {{redefinition of 'x'}} + // expected-note@* {{previous definition is here}} + diff --git a/clang/test/Modules/pr90154.cppm b/clang/test/Modules/pr90154.cppm new file mode 100644 index 0000000000000..d626646fbc488 --- /dev/null +++ b/clang/test/Modules/pr90154.cppm @@ -0,0 +1,25 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 %t/a.cppm -emit-module-interface -o %t/a.pcm +// RUN: %clang_cc1 -std=c++20 %t/use.cc -fmodule-file=a=%t/a.pcm -fsyntax-only -verify +// +// Test again with reduced BMI +// RUN: %clang_cc1 -std=c++20 %t/a.cppm -emit-reduced-module-interface -o %t/a.pcm +// RUN: %clang_cc1 -std=c++20 %t/use.cc -fmodule-file=a=%t/a.pcm -fsyntax-only -verify + +//--- a.cppm +export module a; +int b = 99; +namespace a { int a = 43; } + +//--- use.cc +// expected-no-diagnostics +import a; + +namespace a { + double a = 43.0; +} + +int b = 883; diff --git a/clang/unittests/AST/ExternalASTSourceTest.cpp b/clang/unittests/AST/ExternalASTSourceTest.cpp index 8e1bde1247f66..ad209604971f4 100644 --- a/clang/unittests/AST/ExternalASTSourceTest.cpp +++ b/clang/unittests/AST/ExternalASTSourceTest.cpp @@ -68,7 +68,8 @@ TEST(ExternalASTSourceTest, FailedLookupOccursOnce) { TestSource(unsigned &Calls) : Calls(Calls) {} bool FindExternalVisibleDeclsByName(const DeclContext *, - DeclarationName Name) override { + DeclarationName Name, + Module *NamedModule) override { if (Name.getAsString() == "j") ++Calls; return false; diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ASTUtils.h b/lldb/source/Plugins/ExpressionParser/Clang/ASTUtils.h index d5c68a436e090..7403b79be6cc0 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ASTUtils.h +++ b/lldb/source/Plugins/ExpressionParser/Clang/ASTUtils.h @@ -71,8 +71,9 @@ class ExternalASTSourceWrapper : public clang::ExternalSemaSource { } bool FindExternalVisibleDeclsByName(const clang::DeclContext *DC, - clang::DeclarationName Name) override { - return m_Source->FindExternalVisibleDeclsByName(DC, Name); + clang::DeclarationName Name, + clang::Module *NamedModule) override { + return m_Source->FindExternalVisibleDeclsByName(DC, Name, NamedModule); } bool LoadExternalSpecializations(const clang::Decl *D, @@ -388,9 +389,10 @@ class SemaSourceWithPriorities : public clang::ExternalSemaSource { } bool FindExternalVisibleDeclsByName(const clang::DeclContext *DC, - clang::DeclarationName Name) override { + clang::DeclarationName Name, + clang::Module *NamedModule) override { for (size_t i = 0; i < Sources.size(); ++i) - if (Sources[i]->FindExternalVisibleDeclsByName(DC, Name)) + if (Sources[i]->FindExternalVisibleDeclsByName(DC, Name, NamedModule)) return true; return false; } diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp index e41efdd3f61c7..94ce867ef4a0f 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp @@ -99,7 +99,8 @@ void ClangASTSource::StartTranslationUnit(ASTConsumer 
*Consumer) { // The core lookup interface. bool ClangASTSource::FindExternalVisibleDeclsByName( - const DeclContext *decl_ctx, DeclarationName clang_decl_name) { + const DeclContext *decl_ctx, DeclarationName clang_decl_name, + clang::Module *NamedModule) { if (!m_ast_context) { SetNoExternalVisibleDeclsForName(decl_ctx, clang_decl_name); return false; diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.h b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.h index 83c910477acc8..6dc4ecc94e0ed 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.h +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.h @@ -84,7 +84,8 @@ class ClangASTSource : public clang::ExternalASTSource, /// \return /// Whatever SetExternalVisibleDeclsForName returns. bool FindExternalVisibleDeclsByName(const clang::DeclContext *DC, - clang::DeclarationName Name) override; + clang::DeclarationName Name, + clang::Module *NamedModule) override; /// Enumerate all Decls in a given lexical context. /// @@ -212,8 +213,9 @@ class ClangASTSource : public clang::ExternalASTSource, ClangASTSourceProxy(ClangASTSource &original) : m_original(original) {} bool FindExternalVisibleDeclsByName(const clang::DeclContext *DC, - clang::DeclarationName Name) override { - return m_original.FindExternalVisibleDeclsByName(DC, Name); + clang::DeclarationName Name, + clang::Module *NamedModule) override { + return m_original.FindExternalVisibleDeclsByName(DC, Name, NamedModule); } void FindExternalLexicalDecls( diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExternalASTSourceCallbacks.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExternalASTSourceCallbacks.cpp index e746e6afe39be..bf4537e69eb63 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExternalASTSourceCallbacks.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExternalASTSourceCallbacks.cpp @@ -50,7 +50,8 @@ void ClangExternalASTSourceCallbacks::FindExternalLexicalDecls( } bool ClangExternalASTSourceCallbacks::FindExternalVisibleDeclsByName( - const clang::DeclContext *DC, clang::DeclarationName Name) { + const clang::DeclContext *DC, clang::DeclarationName Name, + clang::Module *NamedModule) { llvm::SmallVector decls; // Objective-C methods are not added into the LookupPtr when they originate // from an external source. SetExternalVisibleDeclsForName() adds them. 
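[Editor's note -- illustration, not part of the patch. The lldb changes above
are the mechanical update every ExternalASTSource subclass needs for the new
parameter; out-of-tree sources follow the same pattern. A hedged sketch (the
class here is hypothetical; the override signature is the one introduced in
this patch):

  class MyExternalSource : public clang::ExternalASTSource {
    bool FindExternalVisibleDeclsByName(const clang::DeclContext *DC,
                                        clang::DeclarationName Name,
                                        clang::Module *NamedModule) override {
      // Sources that predate module-local lookup can ignore NamedModule;
      // wrappers should forward it unchanged, as SemaSourceWithPriorities and
      // ClangASTSourceProxy do above, so module-local results keep working.
      (void)NamedModule;
      return false;
    }
  };
]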
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExternalASTSourceCallbacks.h b/lldb/source/Plugins/ExpressionParser/Clang/ClangExternalASTSourceCallbacks.h
index 6bd18186a567d..d2e9c1552fd38 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExternalASTSourceCallbacks.h
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExternalASTSourceCallbacks.h
@@ -38,7 +38,8 @@ class ClangExternalASTSourceCallbacks : public clang::ExternalASTSource {
       llvm::SmallVectorImpl<clang::Decl *> &Result) override;
 
   bool FindExternalVisibleDeclsByName(const clang::DeclContext *DC,
-                                      clang::DeclarationName Name) override;
+                                      clang::DeclarationName Name,
+                                      clang::Module *NamedModule) override;
 
   void CompleteType(clang::TagDecl *tag_decl) override;
 
diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCDeclVendor.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCDeclVendor.cpp
index 96a259b811b5e..24fc5bb2c047f 100644
--- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCDeclVendor.cpp
+++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCDeclVendor.cpp
@@ -30,7 +30,8 @@ class lldb_private::AppleObjCExternalASTSource
       : m_decl_vendor(decl_vendor) {}
 
   bool FindExternalVisibleDeclsByName(const clang::DeclContext *decl_ctx,
-                                      clang::DeclarationName name) override {
+                                      clang::DeclarationName name,
+                                      clang::Module *NamedModule) override {
     Log *log(GetLog(
         LLDBLog::Expressions)); // FIXME - a more appropriate log channel?

From d1d25641f4cb87ab2c07a4136ba1cec4fb6cf578 Mon Sep 17 00:00:00 2001
From: Timm Baeder
Date: Wed, 15 Jan 2025 08:41:20 +0100
Subject: [PATCH 37/82] [clang][bytecode] Handle UETT_PtrAuthTypeDiscriminator
 (#122941)

---
 clang/lib/AST/ByteCode/Compiler.cpp               | 10 ++++++++++
 clang/test/SemaCXX/ptrauth-type-discriminator.cpp |  3 +++
 2 files changed, 13 insertions(+)

diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp
index a5dfaaf319655..c6e2a1e50a2aa 100644
--- a/clang/lib/AST/ByteCode/Compiler.cpp
+++ b/clang/lib/AST/ByteCode/Compiler.cpp
@@ -2141,6 +2141,16 @@ bool Compiler<Emitter>::VisitUnaryExprOrTypeTraitExpr(
     return this->emitConst(ASTCtx.toCharUnitsFromBits(Bits).getQuantity(), E);
   }
 
+  if (Kind == UETT_PtrAuthTypeDiscriminator) {
+    if (E->getArgumentType()->isDependentType())
+      return this->emitInvalid(E);
+
+    return this->emitConst(
+        const_cast<ASTContext &>(ASTCtx).getPointerAuthTypeDiscriminator(
+            E->getArgumentType()),
+        E);
+  }
+
   return false;
 }
 
diff --git a/clang/test/SemaCXX/ptrauth-type-discriminator.cpp b/clang/test/SemaCXX/ptrauth-type-discriminator.cpp
index 685ca1f03fddd..f5b71ed86acf7 100644
--- a/clang/test/SemaCXX/ptrauth-type-discriminator.cpp
+++ b/clang/test/SemaCXX/ptrauth-type-discriminator.cpp
@@ -1,6 +1,9 @@
 // RUN: %clang_cc1 -triple arm64-apple-ios -std=c++17 -Wno-vla -fsyntax-only -verify -fptrauth-intrinsics %s
 // RUN: %clang_cc1 -triple aarch64-linux-gnu -std=c++17 -Wno-vla -fsyntax-only -verify -fptrauth-intrinsics %s
 
+// RUN: %clang_cc1 -triple arm64-apple-ios -std=c++17 -Wno-vla -fsyntax-only -verify -fptrauth-intrinsics -fexperimental-new-constant-interpreter %s
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -std=c++17 -Wno-vla -fsyntax-only -verify -fptrauth-intrinsics -fexperimental-new-constant-interpreter %s
+
 // RUN: not %clang_cc1 -triple arm64-apple-ios -std=c++17 -Wno-vla -fsyntax-only %s 2>&1 | FileCheck %s
 // CHECK: this target does not support pointer authentication

From 929eb500d4c9b3fff0693c49fd55c8093dc1ad62 Mon Sep 17 00:00:00 2001
From: ziereis <44057120+ziereis@users.noreply.github.com>
Date: Wed, 15 Jan 2025 09:12:39 +0100
Subject: [PATCH 38/82] [mlir] Rewrites for I2 to I8 signed and unsigned
 extension (#121298)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds rewrites for i2 to i8 signed and unsigned extension, similar to the
ones that already exist for i4 to i8 conversion.

I use this for i6 quantized models, and this gives me roughly a 2x speedup
for an i6 4096x4096 dequantization-matmul on an AMD 5950x.

I didn't add the rewrite for i8 to i2 truncation because I currently don't
use it, but if this is needed, I can add it as well.

---------

Co-authored-by: Andrzej Warzyński
---
 .../Transforms/VectorEmulateNarrowType.cpp    | 235 +++++++++++++-----
 .../Vector/vector-rewrite-narrow-types.mlir   | 188 +++++++++++++-
 2 files changed, 357 insertions(+), 66 deletions(-)

diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
index d04f302200519..a674a59009181 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
@@ -1090,15 +1090,20 @@ static LogicalResult alignedConversionPrecondition(PatternRewriter &rewriter,
   unsigned srcElemBitwidth = subByteVecType.getElementTypeBitWidth();
   unsigned dstElemBitwidth = dstType.getElementTypeBitWidth();
 
-  // Only {s}i4 -> (size_of({{s}i/f}) >= 8) are supported for now.
-  if (srcElemBitwidth != 4 || dstElemBitwidth < 8 ||
-      (dstElemBitwidth % srcElemBitwidth) != 0)
-    return rewriter.notifyMatchFailure(op, "Not a supported aligned case");
+  if (dstElemBitwidth < 8)
+    return rewriter.notifyMatchFailure(
+        op, "the bitwidth of dstType must be greater than or equal to 8");
+  if (dstElemBitwidth % srcElemBitwidth != 0)
+    return rewriter.notifyMatchFailure(op, "unaligned cases are not supported");
+  if (srcElemBitwidth != 2 && srcElemBitwidth != 4)
+    return rewriter.notifyMatchFailure(
+        op, "only src bitwidth of 2 or 4 is supported at this moment");
 
-  const int numSrcElemsPerDestElem = dstElemBitwidth / srcElemBitwidth;
-  if ((subByteVecType.getShape().back() % numSrcElemsPerDestElem) != 0)
+  const int numSrcElemsPerByte = 8 / srcElemBitwidth;
+  if ((subByteVecType.getShape().back() % numSrcElemsPerByte) != 0)
     return rewriter.notifyMatchFailure(
-        op, "Not an even number of i4 elements in trailing dim");
+        op, "the trailing dimension of the input vector of sub-bytes must be a "
+            "multiple of 8 / <sub-byte-width>");
 
   return success();
 }
@@ -1179,70 +1184,166 @@ Value BitCastRewriter::genericRewriteStep(
   return runningResult;
 }
 
-/// Rewrite the i4 -> i8 signed extension into a sequence of shuffles and
-/// bitwise ops that take advantage of high-level information to avoid leaving
-/// LLVM to scramble with peephole optimizations.
-static Value rewriteI4ToI8SignedExt(PatternRewriter &rewriter, Location loc,
-                                    Value srcValue) {
-  VectorType srcVecType = cast<VectorType>(srcValue.getType());
-  assert(srcVecType.getElementType().isSignlessInteger(4) &&
-         "Expected i4 type");
+/// Bitcasts the aligned `subByteVec` vector to a vector of i8.
+/// Where aligned means it satisfies the alignedConversionPreconditions.
+///
+/// Example:
+/// vector<16x16xi2> -> vector<16x4xi8>
+/// vector<16x16xi4> -> vector<16x8xi8>
+static Value bitcastSubByteVectorToI8(PatternRewriter &rewriter, Location loc,
+                                      Value subByteVec) {
+  auto srcVecType = cast<VectorType>(subByteVec.getType());
+  int64_t srcBitwidth = srcVecType.getElementType().getIntOrFloatBitWidth();
+  assert(8 % srcBitwidth == 0 &&
+         "Unsupported sub-byte type (not a divisor of i8)");
+  int64_t numSrcElemsPerByte = 8 / srcBitwidth;
+  SmallVector<int64_t> vecShape(srcVecType.getShape());
+  // Adjust last dimension of the vector, so the total size remains the same.
+  vecShape.back() = vecShape.back() / numSrcElemsPerByte;
+  auto i8VecType = VectorType::get(vecShape, rewriter.getI8Type());
+  return rewriter.create<vector::BitCastOp>(loc, i8VecType, subByteVec);
+}
 
-  // 1. Generate a bitcast vector<Xxi4> -> vector<X/2xi8>.
-  SmallVector<int64_t> i8VecShape = llvm::to_vector(srcVecType.getShape());
-  constexpr int64_t i4Toi8BitwidthFactor = 2;
-  i8VecShape.back() = i8VecShape.back() / i4Toi8BitwidthFactor;
-  auto i8VecType = VectorType::get(i8VecShape, rewriter.getI8Type());
-  Value i8Vector = rewriter.create<vector::BitCastOp>(loc, i8VecType, srcValue);
+/// Extracts a signed N-bit sequence from each element of a vector of bytes,
+/// starting at the specified bit index.
+/// The `bitIdx` starts at 0 from the LSB and moves to the left.
+///
+/// Example for a single element:
+/// Extract numBits=2 starting at bitIdx=2
+/// src     = [0 | 1 | 0 | 1 | 1 | 1 | 1 | 0]
+/// indices = [7 | 6 | 5 | 4 | 3 | 2 | 1 | 0]
+/// target  = [.   .   .   .   ^   ^   .   .]
+///
+/// The target sequence is [11](decimal=-1) as signed 2-bit integer.
+/// So the result should be [11 11 11 11](decimal=-1) as signed 8-bit integer.
+///
+/// src     = [01 01 11 10]
+/// shl     = arith.shl(src, 4) -> [11 10 00 00]
+/// result  = arith.shrsi(shl, 6) -> [11 11 11 11]
+static Value extractNBitsPerByteAndSignExtendToI8(PatternRewriter &rewriter,
+                                                  Location loc, Value src,
+                                                  int bitIdx, int numBits) {
+  auto srcType = cast<VectorType>(src.getType());
+  Value shl = src;
+  int8_t bitsToShiftLeft = 8 - numBits - bitIdx;
+  assert(bitIdx >= 0 && bitsToShiftLeft >= 0 && numBits > 0 && numBits <= 8 &&
+         "Invalid bitIdx range");
+  if (bitsToShiftLeft != 0) {
+    Value shiftLeftValues = rewriter.create<arith::ConstantOp>(
+        loc, DenseElementsAttr::get(srcType, bitsToShiftLeft));
+    shl = rewriter.create<arith::ShLIOp>(loc, src, shiftLeftValues);
+  }
 
-  // 2. Extend i4 elements to i8 elements using shifts. Low i4 elemens of each
-  // byte are place in one vector and the high i4 elements in another vector.
-  constexpr int8_t bitsToShift = 4;
-  auto shiftValues = rewriter.create<arith::ConstantOp>(
-      loc, DenseElementsAttr::get(i8VecType, bitsToShift));
-  Value shl = rewriter.create<arith::ShLIOp>(loc, i8Vector, shiftValues);
-  Value low = rewriter.create<arith::ShRSIOp>(loc, shl, shiftValues);
-  Value high = rewriter.create<arith::ShRSIOp>(loc, i8Vector, shiftValues);
+  int8_t bitsToShiftRight = 8 - numBits;
+  Value shiftRightValues = rewriter.create<arith::ConstantOp>(
+      loc, DenseElementsAttr::get(srcType, bitsToShiftRight));
+  Value shr = rewriter.create<arith::ShRSIOp>(loc, shl, shiftRightValues);
+  return shr;
+}
 
-  // 3. Interleave low and high i8 elements.
-  return rewriter.create<vector::InterleaveOp>(loc, low, high);
+/// Extracts an unsigned N-bit sequence from each element of a vector of bytes,
+/// starting at the specified bit index.
+/// The `bitIdx` starts at 0 from the LSB and moves to the left.
+///
+/// Example for a single element:
+/// Extract numBits=2 starting at bitIdx=2
+/// src     = [0 | 1 | 0 | 1 | 1 | 0 | 1 | 0]
+/// indices = [7 | 6 | 5 | 4 | 3 | 2 | 1 | 0]
+/// target  = [.   .   .   .   ^   ^   .   .]
+///
+/// The target sequence is [10](decimal=2) as unsigned 2-bit integer.
+/// So the result should be [00 00 00 10](decimal=2) as unsigned 8-bit integer.
+///
+/// src     = [01 01 10 10]
+/// mask    = [00 00 00 11]
+/// shr     = arith.shrui(src, 2) = [00 01 01 10]
+/// result  = arith.andi(shr, mask) = [00 00 00 10]
+/// NOTE: Similarly to extractNBitsPerByteAndSignExtendToI8, this could be
+/// achieved by using arith::ShLIOp + arith::ShRUIOp instead of the masking.
+/// However, by using arith::ShRUIOp + arith::AndIOp, we are eliminating shift
+/// left when the index is 0.
+static Value extractNBitsPerByteAndExtendToI8(PatternRewriter &rewriter,
+                                              Location loc, Value src,
+                                              int bitIdx, int numBits) {
+  assert(bitIdx >= 0 && bitIdx <= 8 - numBits && numBits > 0 && numBits <= 8 &&
+         "Invalid bitIdx range");
+  auto srcType = cast<VectorType>(src.getType());
+  int8_t bitsToShiftRight = bitIdx;
+  Value shr = src;
+  if (bitsToShiftRight != 0) {
+    Value shiftRightValues = rewriter.create<arith::ConstantOp>(
+        loc, DenseElementsAttr::get(srcType, bitsToShiftRight));
+    shr = rewriter.create<arith::ShRUIOp>(loc, src, shiftRightValues);
+  }
+  if (bitIdx + numBits == 8) {
+    return shr;
+  }
+  uint8_t lowBitsMask = (1 << numBits) - 1;
+  Value lowBitsMaskValues = rewriter.create<arith::ConstantOp>(
+      loc, DenseElementsAttr::get(srcType, lowBitsMask));
+  return rewriter.create<arith::AndIOp>(loc, shr, lowBitsMaskValues);
 }
 
-/// Rewrite the i4 -> i8 unsigned extension into a sequence of shuffles and
-/// bitwise ops that take advantage of high-level information to avoid leaving
-/// LLVM to scramble with peephole optimizations.
-static Value rewriteI4ToI8UnsignedExt(PatternRewriter &rewriter, Location loc,
-                                      Value srcValue) {
-  VectorType srcVecType = cast<VectorType>(srcValue.getType());
+using ExtractNBitsFn =
+    std::function<Value(PatternRewriter &, Location, Value, int, int)>;
+
+/// Rewrite the i4 -> i8 extension into a sequence of shuffles and
+/// bitwise ops to avoid leaving LLVM to scramble with peephole optimizations.
+static Value rewriteI4ToI8Ext(PatternRewriter &rewriter, Location loc,
+                              Value srcValue, const ExtractNBitsFn &extFn) {
+  auto srcVecType = cast<VectorType>(srcValue.getType());
   assert(srcVecType.getElementType().isSignlessInteger(4) &&
          "Expected i4 type");
 
   // 1. Generate a bitcast vector<Xxi4> -> vector<X/2xi8>.
-  SmallVector<int64_t> i8VecShape = llvm::to_vector(srcVecType.getShape());
-  constexpr int64_t i4Toi8BitwidthFactor = 2;
-  i8VecShape.back() = i8VecShape.back() / i4Toi8BitwidthFactor;
-  auto i8VecType = VectorType::get(i8VecShape, rewriter.getI8Type());
-  Value i8Vector = rewriter.create<vector::BitCastOp>(loc, i8VecType, srcValue);
-
-  // 2 Extend the i4 elements using shifts & masking. Low i4 elements of each
-  // byte are placed in one vector and the high i4 elements in another vector.
-  constexpr uint8_t lowBitsMask = 15; // Equivalent to [00001111] bit mask
-  auto lowBitsMaskValues = rewriter.create<arith::ConstantOp>(
-      loc, DenseElementsAttr::get(i8VecType, lowBitsMask));
-  Value low = rewriter.create<arith::AndIOp>(loc, i8VecType, i8Vector,
-                                             lowBitsMaskValues);
-  constexpr int8_t highBitsToShift = 4;
-  auto highShiftValues = rewriter.create<arith::ConstantOp>(
-      loc, DenseElementsAttr::get(i8VecType, highBitsToShift));
-  Value high = rewriter.create<arith::ShRUIOp>(loc, i8Vector, highShiftValues);
+  Value i8Vector = bitcastSubByteVectorToI8(rewriter, loc, srcValue);
+
+  // 2. Extend i4 elements to i8 elements. Low i4 elements of each
+  // byte are placed in one vector and the high i4 elements in another vector.
+  Value low = extFn(rewriter, loc, i8Vector, 0, 4);
+  Value high = extFn(rewriter, loc, i8Vector, 4, 4);
 
   // 3. Interleave low and high i8 elements.
   return rewriter.create<vector::InterleaveOp>(loc, low, high);
 }
 
+/// Rewrite the i2 -> i8 extension into a sequence of shuffles and
+/// bitwise ops to avoid leaving LLVM to scramble with peephole optimizations.
+static Value rewriteI2ToI8Ext(PatternRewriter &rewriter, Location loc,
+                              Value srcValue, const ExtractNBitsFn &extFn) {
+  VectorType srcVecType = cast<VectorType>(srcValue.getType());
+  assert(srcVecType.getElementType().isSignlessInteger(2) &&
+         "Expected i2 type");
+
+  // 1. Generate a bitcast vector<Xxi2> -> vector<X/4xi8>.
+  Value i8Vector = bitcastSubByteVectorToI8(rewriter, loc, srcValue);
+
+  // 2. Extract each i2 element.
+  // Position 0 (bits 0-1)
+  Value vec0 = extFn(rewriter, loc, i8Vector, 0, 2);
+  // Position 1 (bits 2-3)
+  Value vec1 = extFn(rewriter, loc, i8Vector, 2, 2);
+  // Position 2 (bits 4-5)
+  Value vec2 = extFn(rewriter, loc, i8Vector, 4, 2);
+  // Position 3 (bits 6-7)
+  Value vec3 = extFn(rewriter, loc, i8Vector, 6, 2);
+
+  // 3. Interleave all 4 elements by first interleaving
+  // even elements and then odd
+  // vec0 = [0,0,0,0],...
+  // vec1 = [1,1,1,1],...
+  // vec2 = [2,2,2,2],...
+  // vec3 = [3,3,3,3],...
+  // 02   = [0,2,0,2,0,2,0,2],...
+  // 13   = [1,3,1,3,1,3,1,3],...
+  // 0213 = [0,1,2,3,...],...
+  Value interleave02 = rewriter.create<vector::InterleaveOp>(loc, vec0, vec2);
+  Value interleave13 = rewriter.create<vector::InterleaveOp>(loc, vec1, vec3);
+  return rewriter.create<vector::InterleaveOp>(loc, interleave02,
+                                               interleave13);
+}
+
 /// Rewrite the i8 -> i4 truncation into a deinterleave and series of bitwise
-/// ops that take advantage of high-level information to avoid leaving LLVM to
-/// scramble with peephole optimizations.
+/// ops to avoid leaving LLVM to scramble with peephole optimizations.
 static Value rewriteI8ToI4Trunc(PatternRewriter &rewriter, Location loc,
                                 Value srcValue) {
   VectorType srcVecType = cast<VectorType>(srcValue.getType());
@@ -1443,13 +1544,19 @@ struct RewriteAlignedSubByteIntExt : OpRewritePattern<ConversionOpType> {
       return failure();
 
     // Perform the rewrite.
+    Location loc = conversionOp.getLoc();
+    const auto &extFn = isSigned ? extractNBitsPerByteAndSignExtendToI8
+                                 : extractNBitsPerByteAndExtendToI8;
     Value subByteExt;
-    if (isSigned) {
-      subByteExt =
-          rewriteI4ToI8SignedExt(rewriter, conversionOp.getLoc(), srcValue);
-    } else {
-      subByteExt =
-          rewriteI4ToI8UnsignedExt(rewriter, conversionOp.getLoc(), srcValue);
+    switch (srcVecType.getElementType().getIntOrFloatBitWidth()) {
+    case 2:
+      subByteExt = rewriteI2ToI8Ext(rewriter, loc, srcValue, extFn);
+      break;
+    case 4:
+      subByteExt = rewriteI4ToI8Ext(rewriter, loc, srcValue, extFn);
+      break;
+    default:
+      return failure();
     }
 
     // Finalize the rewrite.
@@ -1490,6 +1597,10 @@ struct RewriteAlignedSubByteIntTrunc : OpRewritePattern<arith::TruncIOp> {
     if (failed(commonConversionPrecondition(rewriter, srcVecType, truncOp)))
       return failure();
 
+    // TODO: Add support for truncating to i2.
+    if (dstVecType.getElementType().getIntOrFloatBitWidth() == 2)
+      return failure();
+
     // Check general alignment preconditions. We invert the src/dst type order
     // to reuse the existing precondition logic.
     if (failed(alignedConversionPrecondition(rewriter, dstVecType, srcVecType,
diff --git a/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir b/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir
index 210025e30d7db..8d28f248e392d 100644
--- a/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir
+++ b/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir
@@ -193,6 +193,25 @@ func.func @f3ext(%a: vector<5xi8>) -> vector<8xi17> {
   return %1 : vector<8xi17>
 }
 
+
+// Negative test - the trailing dim 1 is not a multiple of 2 (i.e. 8 / 4).
+// CHECK-LABEL: func.func @unaligned_extsi_i4_to_i8( +func.func @unaligned_extsi_i4_to_i8(%a: vector<1xi4>) -> vector<1xi8> { + // CHECK-NOT: arith.bitcast + // CHECK: arith.extsi %[[IN:.*]] : vector<1xi4> to vector<1xi8> + %0 = arith.extsi %a : vector<1xi4> to vector<1xi8> + return %0 : vector<1xi8> +} + +// Negative test - the trailing dim 2 is not a multiple of 4 (i.e. 8 / 2). +// CHECK-LABEL: func.func @unaligned_extsi_i2_to_i8( +func.func @unaligned_extsi_i2_to_i8(%a: vector<2xi2>) -> vector<2xi8> { + // CHECK-NOT: arith.bitcast + // CHECK: arith.extsi %[[IN:.*]] : vector<2xi2> to vector<2xi8> + %0 = arith.extsi %a : vector<2xi2> to vector<2xi8> + return %0 : vector<2xi8> +} + // CHECK-LABEL: func.func @aligned_extsi_i4_to_i8( func.func @aligned_extsi_i4_to_i8(%a: vector<8xi4>) -> vector<8xi8> { // CHECK-SAME: %[[IN:.*]]: vector<8xi4>) -> vector<8xi8> { @@ -206,6 +225,31 @@ func.func @aligned_extsi_i4_to_i8(%a: vector<8xi4>) -> vector<8xi8> { return %0 : vector<8xi8> } +// CHECK-LABEL: func.func @aligned_extsi_i2_to_i8( +func.func @aligned_extsi_i2_to_i8(%a: vector<8xi2>) -> vector<8xi8> { +// CHECK-SAME: %[[IN:.*]]: vector<8xi2>) -> vector<8xi8> { +// CHECK: %[[CST_2:.*]] = arith.constant dense<2> : vector<2xi8> +// CHECK: %[[CST_4:.*]] = arith.constant dense<4> : vector<2xi8> +// CHECK: %[[CST_6:.*]] = arith.constant dense<6> : vector<2xi8> +// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8xi2> to vector<2xi8> +// Extract bits 0-1 +// CHECK: %[[SHL_6:.*]] = arith.shli %[[BITCAST]], %[[CST_6]] : vector<2xi8> +// CHECK: %[[ELEM0:.*]] = arith.shrsi %[[SHL_6]], %[[CST_6]] : vector<2xi8> +// Extract bits 2-3 +// CHECK: %[[SHL_4:.*]] = arith.shli %[[BITCAST]], %[[CST_4]] : vector<2xi8> +// CHECK: %[[ELEM1:.*]] = arith.shrsi %[[SHL_4]], %[[CST_6]] : vector<2xi8> +// Extract bits 4-5 +// CHECK: %[[SHL_2:.*]] = arith.shli %[[BITCAST]], %[[CST_2]] : vector<2xi8> +// CHECK: %[[ELEM2:.*]] = arith.shrsi %[[SHL_2]], %[[CST_6]] : vector<2xi8> +// Extract bits 6-7 +// CHECK: %[[ELEM3:.*]] = arith.shrsi %[[BITCAST]], %[[CST_6]] : vector<2xi8> +// CHECK: %[[INTERLEAVE02:.*]] = vector.interleave %[[ELEM0]], %[[ELEM2]] : vector<2xi8> +// CHECK: %[[INTERLEAVE13:.*]] = vector.interleave %[[ELEM1]], %[[ELEM3]] : vector<2xi8> +// CHECK: %[[RESULT:.*]] = vector.interleave %[[INTERLEAVE02]], %[[INTERLEAVE13]] : vector<4xi8> + %0 = arith.extsi %a : vector<8xi2> to vector<8xi8> + return %0 : vector<8xi8> +} + // CHECK-LABEL: func.func @aligned_extsi_i4_to_i32( func.func @aligned_extsi_i4_to_i32(%a: vector<8xi4>) -> vector<8xi32> { // CHECK-SAME: %[[IN:.*]]: vector<8xi4>) -> vector<8xi32> { @@ -220,8 +264,34 @@ func.func @aligned_extsi_i4_to_i32(%a: vector<8xi4>) -> vector<8xi32> { return %0 : vector<8xi32> } -// CHECK-LABEL: func.func @aligned_extsi_2d( -func.func @aligned_extsi_2d(%a: vector<8x32xi4>) -> vector<8x32xi32> { +// CHECK-LABEL: func.func @aligned_extsi_i2_to_i32( +func.func @aligned_extsi_i2_to_i32(%a: vector<8xi2>) -> vector<8xi32> { +// CHECK-SAME: %[[IN:.*]]: vector<8xi2>) -> vector<8xi32> { +// CHECK: %[[CST_2:.*]] = arith.constant dense<2> : vector<2xi8> +// CHECK: %[[CST_4:.*]] = arith.constant dense<4> : vector<2xi8> +// CHECK: %[[CST_6:.*]] = arith.constant dense<6> : vector<2xi8> +// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8xi2> to vector<2xi8> +// Extract bits 0-1 +// CHECK: %[[SHL_6:.*]] = arith.shli %[[BITCAST]], %[[CST_6]] : vector<2xi8> +// CHECK: %[[ELEM0:.*]] = arith.shrsi %[[SHL_6]], %[[CST_6]] : vector<2xi8> +// Extract bits 2-3 +// CHECK: 
%[[SHL_4:.*]] = arith.shli %[[BITCAST]], %[[CST_4]] : vector<2xi8> +// CHECK: %[[ELEM1:.*]] = arith.shrsi %[[SHL_4]], %[[CST_6]] : vector<2xi8> +// Extract bits 4-5 +// CHECK: %[[SHL_2:.*]] = arith.shli %[[BITCAST]], %[[CST_2]] : vector<2xi8> +// CHECK: %[[ELEM2:.*]] = arith.shrsi %[[SHL_2]], %[[CST_6]] : vector<2xi8> +// Extract bits 6-7 +// CHECK: %[[ELEM3:.*]] = arith.shrsi %[[BITCAST]], %[[CST_6]] : vector<2xi8> +// CHECK: %[[INTERLEAVE02:.*]] = vector.interleave %[[ELEM0]], %[[ELEM2]] : vector<2xi8> +// CHECK: %[[INTERLEAVE13:.*]] = vector.interleave %[[ELEM1]], %[[ELEM3]] : vector<2xi8> +// CHECK: %[[INTERLEAVE:.*]] = vector.interleave %[[INTERLEAVE02]], %[[INTERLEAVE13]] : vector<4xi8> +// CHECK: %[[RESULT:.*]] = arith.extsi %[[INTERLEAVE]] : vector<8xi8> to vector<8xi32> + %0 = arith.extsi %a : vector<8xi2> to vector<8xi32> + return %0 : vector<8xi32> +} + +// CHECK-LABEL: func.func @aligned_extsi_i4_to_i32_2d( +func.func @aligned_extsi_i4_to_i32_2d(%a: vector<8x32xi4>) -> vector<8x32xi32> { // CHECK-SAME: %[[IN:.*]]: vector<8x32xi4>) -> vector<8x32xi32> { // CHECK: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<8x16xi8> // CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8x32xi4> to vector<8x16xi8> @@ -234,6 +304,32 @@ func.func @aligned_extsi_2d(%a: vector<8x32xi4>) -> vector<8x32xi32> { return %0 : vector<8x32xi32> } +// CHECK-LABEL: func.func @aligned_extsi_i2_to_i32_2d( +func.func @aligned_extsi_i2_to_i32_2d(%a: vector<8x32xi2>) -> vector<8x32xi32> { +// CHECK-SAME: %[[IN:.*]]: vector<8x32xi2>) -> vector<8x32xi32> { +// CHECK: %[[CST_2:.*]] = arith.constant dense<2> : vector<8x8xi8> +// CHECK: %[[CST_4:.*]] = arith.constant dense<4> : vector<8x8xi8> +// CHECK: %[[CST_6:.*]] = arith.constant dense<6> : vector<8x8xi8> +// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8x32xi2> to vector<8x8xi8> +// Extract bits 0-1 +// CHECK: %[[SHL_6:.*]] = arith.shli %[[BITCAST]], %[[CST_6]] : vector<8x8xi8> +// CHECK: %[[ELEM0:.*]] = arith.shrsi %[[SHL_6]], %[[CST_6]] : vector<8x8xi8> +// Extract bits 2-3 +// CHECK: %[[SHL_4:.*]] = arith.shli %[[BITCAST]], %[[CST_4]] : vector<8x8xi8> +// CHECK: %[[ELEM1:.*]] = arith.shrsi %[[SHL_4]], %[[CST_6]] : vector<8x8xi8> +// Extract bits 4-5 +// CHECK: %[[SHL_2:.*]] = arith.shli %[[BITCAST]], %[[CST_2]] : vector<8x8xi8> +// CHECK: %[[ELEM2:.*]] = arith.shrsi %[[SHL_2]], %[[CST_6]] : vector<8x8xi8> +// Extract bits 6-7 +// CHECK: %[[ELEM3:.*]] = arith.shrsi %[[BITCAST]], %[[CST_6]] : vector<8x8xi8> +// CHECK: %[[INTERLEAVE02:.*]] = vector.interleave %[[ELEM0]], %[[ELEM2]] : vector<8x8xi8> +// CHECK: %[[INTERLEAVE13:.*]] = vector.interleave %[[ELEM1]], %[[ELEM3]] : vector<8x8xi8> +// CHECK: %[[INTERLEAVE:.*]] = vector.interleave %[[INTERLEAVE02]], %[[INTERLEAVE13]] : vector<8x16xi8> +// CHECK: %[[RESULT:.*]] = arith.extsi %[[INTERLEAVE]] : vector<8x32xi8> to vector<8x32xi32> + %0 = arith.extsi %a : vector<8x32xi2> to vector<8x32xi32> + return %0 : vector<8x32xi32> +} + // CHECK-LABEL: func.func @aligned_trunci_i8_to_i4( func.func @aligned_trunci_i8_to_i4(%a: vector<8xi8>) -> vector<8xi4> { @@ -292,6 +388,13 @@ func.func @aligned_trunci_nd(%a: vector<3x8x32xi32>) -> vector<3x8x32xi4> { return %0 : vector<3x8x32xi4> } +func.func @aligned_trunci_i8_to_i2_no_match(%a: vector<8xi8>) -> vector<8xi2> { + // CHECK-NOT: arith.bitcast + // CHECK: arith.trunci %[[IN:.*]] : vector<8xi8> to vector<8xi2> + %0 = arith.trunci %a : vector<8xi8> to vector<8xi2> + return %0 : vector<8xi2> +} + // CHECK-LABEL: func.func @aligned_extui_i4_to_i8( 
func.func @aligned_extui_i4_to_i8(%a: vector<8xi4>) -> vector<8xi8> { // CHECK-SAME: %[[IN:.*]]: vector<8xi4>) -> vector<8xi8> { @@ -305,6 +408,31 @@ func.func @aligned_extui_i4_to_i8(%a: vector<8xi4>) -> vector<8xi8> { return %0 : vector<8xi8> } +// CHECK-LABEL: func.func @aligned_extui_i2_to_i8( +func.func @aligned_extui_i2_to_i8(%a: vector<8xi2>) -> vector<8xi8> { +// CHECK-SAME: %[[IN:.*]]: vector<8xi2>) -> vector<8xi8> { +// CHECK: %[[CST_6:.*]] = arith.constant dense<6> : vector<2xi8> +// CHECK: %[[CST_4:.*]] = arith.constant dense<4> : vector<2xi8> +// CHECK: %[[CST_2:.*]] = arith.constant dense<2> : vector<2xi8> +// CHECK: %[[LOWBITS_MASK:.*]] = arith.constant dense<3> : vector<2xi8> +// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8xi2> to vector<2xi8> +// Extract bits 0-1 +// CHECK: %[[ELEM0:.*]] = arith.andi %[[BITCAST]], %[[LOWBITS_MASK]] : vector<2xi8> +// Extract bits 2-3 +// CHECK: %[[SHR_2:.*]] = arith.shrui %[[BITCAST]], %[[CST_2]] : vector<2xi8> +// CHECK: %[[ELEM1:.*]] = arith.andi %[[SHR_2]], %[[LOWBITS_MASK]] : vector<2xi8> +// Extract bits 4-5 +// CHECK: %[[SHR_4:.*]] = arith.shrui %[[BITCAST]], %[[CST_4]] : vector<2xi8> +// CHECK: %[[ELEM2:.*]] = arith.andi %[[SHR_4]], %[[LOWBITS_MASK]] : vector<2xi8> +// Extract bits 6-7 +// CHECK: %[[ELEM3:.*]] = arith.shrui %[[BITCAST]], %[[CST_6]] : vector<2xi8> +// CHECK: %[[INTERLEAVE02:.*]] = vector.interleave %[[ELEM0]], %[[ELEM2]] : vector<2xi8> +// CHECK: %[[INTERLEAVE13:.*]] = vector.interleave %[[ELEM1]], %[[ELEM3]] : vector<2xi8> +// CHECK: %[[RESULT:.*]] = vector.interleave %[[INTERLEAVE02]], %[[INTERLEAVE13]] : vector<4xi8> + %0 = arith.extui %a : vector<8xi2> to vector<8xi8> + return %0 : vector<8xi8> +} + // CHECK-LABEL: func.func @aligned_extui_i4_to_i32( func.func @aligned_extui_i4_to_i32(%a: vector<8xi4>) -> vector<8xi32> { // CHECK-SAME: %[[IN:.*]]: vector<8xi4>) -> vector<8xi32> { @@ -319,8 +447,34 @@ func.func @aligned_extui_i4_to_i32(%a: vector<8xi4>) -> vector<8xi32> { return %0 : vector<8xi32> } -// CHECK-LABEL: func.func @aligned_extui_2d( -func.func @aligned_extui_2d(%a: vector<8x32xi4>) -> vector<8x32xi32> { +// CHECK-LABEL: func.func @aligned_extui_i2_to_i32( +func.func @aligned_extui_i2_to_i32(%a: vector<8xi2>) -> vector<8xi32> { +// CHECK-SAME: %[[IN:.*]]: vector<8xi2>) -> vector<8xi32> { +// CHECK: %[[CST_6:.*]] = arith.constant dense<6> : vector<2xi8> +// CHECK: %[[CST_4:.*]] = arith.constant dense<4> : vector<2xi8> +// CHECK: %[[CST_2:.*]] = arith.constant dense<2> : vector<2xi8> +// CHECK: %[[LOWBITS_MASK:.*]] = arith.constant dense<3> : vector<2xi8> +// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8xi2> to vector<2xi8> +// Extract bits 0-1 +// CHECK: %[[ELEM0:.*]] = arith.andi %[[BITCAST]], %[[LOWBITS_MASK]] : vector<2xi8> +// Extract bits 2-3 +// CHECK: %[[SHR_2:.*]] = arith.shrui %[[BITCAST]], %[[CST_2]] : vector<2xi8> +// CHECK: %[[ELEM1:.*]] = arith.andi %[[SHR_2]], %[[LOWBITS_MASK]] : vector<2xi8> +// Extract bits 4-5 +// CHECK: %[[SHR_4:.*]] = arith.shrui %[[BITCAST]], %[[CST_4]] : vector<2xi8> +// CHECK: %[[ELEM2:.*]] = arith.andi %[[SHR_4]], %[[LOWBITS_MASK]] : vector<2xi8> +// Extract bits 6-7 +// CHECK: %[[ELEM3:.*]] = arith.shrui %[[BITCAST]], %[[CST_6]] : vector<2xi8> +// CHECK: %[[INTERLEAVE02:.*]] = vector.interleave %[[ELEM0]], %[[ELEM2]] : vector<2xi8> +// CHECK: %[[INTERLEAVE13:.*]] = vector.interleave %[[ELEM1]], %[[ELEM3]] : vector<2xi8> +// CHECK: %[[INTERLEAVE:.*]] = vector.interleave %[[INTERLEAVE02]], %[[INTERLEAVE13]] : vector<4xi8> +// CHECK: 
%[[RESULT:.*]] = arith.extui %[[INTERLEAVE]] : vector<8xi8> to vector<8xi32> + %0 = arith.extui %a : vector<8xi2> to vector<8xi32> + return %0 : vector<8xi32> +} + +// CHECK-LABEL: func.func @aligned_extui_i4_to_i32_2d( +func.func @aligned_extui_i4_to_i32_2d(%a: vector<8x32xi4>) -> vector<8x32xi32> { // CHECK-SAME: %[[VAL_0:.*]]: vector<8x32xi4>) -> vector<8x32xi32> { // CHECK: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<8x16xi8> // CHECK: %[[LOWBITS_MASK:.*]] = arith.constant dense<15> : vector<8x16xi8> @@ -333,6 +487,32 @@ func.func @aligned_extui_2d(%a: vector<8x32xi4>) -> vector<8x32xi32> { return %0 : vector<8x32xi32> } +// CHECK-LABEL: func.func @aligned_extui_i2_to_i32_2d( +func.func @aligned_extui_i2_to_i32_2d(%a: vector<8x32xi2>) -> vector<8x32xi32> { +// CHECK-SAME: %[[IN:.*]]: vector<8x32xi2>) -> vector<8x32xi32> { +// CHECK: %[[CST_6:.*]] = arith.constant dense<6> : vector<8x8xi8> +// CHECK: %[[CST_4:.*]] = arith.constant dense<4> : vector<8x8xi8> +// CHECK: %[[CST_2:.*]] = arith.constant dense<2> : vector<8x8xi8> +// CHECK: %[[LOWBITS_MASK:.*]] = arith.constant dense<3> : vector<8x8xi8> +// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8x32xi2> to vector<8x8xi8> +// Extract bits 0-1 +// CHECK: %[[ELEM0:.*]] = arith.andi %[[BITCAST]], %[[LOWBITS_MASK]] : vector<8x8xi8> +// Extract bits 2-3 +// CHECK: %[[SHR_2:.*]] = arith.shrui %[[BITCAST]], %[[CST_2]] : vector<8x8xi8> +// CHECK: %[[ELEM1:.*]] = arith.andi %[[SHR_2]], %[[LOWBITS_MASK]] : vector<8x8xi8> +// Extract bits 4-5 +// CHECK: %[[SHR_4:.*]] = arith.shrui %[[BITCAST]], %[[CST_4]] : vector<8x8xi8> +// CHECK: %[[ELEM2:.*]] = arith.andi %[[SHR_4]], %[[LOWBITS_MASK]] : vector<8x8xi8> +// Extract bits 6-7 +// CHECK: %[[ELEM3:.*]] = arith.shrui %[[BITCAST]], %[[CST_6]] : vector<8x8xi8> +// CHECK: %[[INTERLEAVE02:.*]] = vector.interleave %[[ELEM0]], %[[ELEM2]] : vector<8x8xi8> +// CHECK: %[[INTERLEAVE13:.*]] = vector.interleave %[[ELEM1]], %[[ELEM3]] : vector<8x8xi8> +// CHECK: %[[INTERLEAVE:.*]] = vector.interleave %[[INTERLEAVE02]], %[[INTERLEAVE13]] : vector<8x16xi8> +// CHECK: %[[RESULT:.*]] = arith.extui %[[INTERLEAVE]] : vector<8x32xi8> to vector<8x32xi32> + %0 = arith.extui %a : vector<8x32xi2> to vector<8x32xi32> + return %0 : vector<8x32xi32> +} + // CHECK-LABEL: func.func @aligned_sitofp( func.func @aligned_sitofp(%a: vector<8xi4>) -> vector<8xf32> { // CHECK-SAME: %[[IN:.*]]: vector<8xi4>) -> vector<8xf32> { From 8ac00ca4867835cacaf013f5c442658b9b1bce38 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 15 Jan 2025 08:19:54 +0000 Subject: [PATCH 39/82] [X86] lowerShuffleWithUndefHalf - don't split vXi8 unary shuffles if the 128-bit source lanes are already in place (#122919) Allows us to use PSHUFB to shuffle the lanes, and then perform a sub-lane permutation down to the lower half Fixes #116815 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 6 +- llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll | 34 +- llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll | 56 +- llvm/test/CodeGen/X86/trunc-vector-width.ll | 12 +- .../vector-interleaved-load-i8-stride-4.ll | 1416 ++-- .../vector-interleaved-load-i8-stride-5.ll | 2120 ++--- .../vector-interleaved-load-i8-stride-6.ll | 16 +- .../vector-interleaved-load-i8-stride-7.ll | 2521 +++--- .../vector-interleaved-load-i8-stride-8.ll | 7145 +++++++---------- .../vector-interleaved-store-i8-stride-5.ll | 696 +- .../vector-interleaved-store-i8-stride-7.ll | 1328 ++- .../X86/vector-shuffle-combining-avx2.ll | 4 +- .../CodeGen/X86/x86-interleaved-access.ll | 200 +- 13 files 
changed, 7061 insertions(+), 8493 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index d4152ff4a816c..90e3e15b1fb46 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -15672,12 +15672,16 @@ static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, (!isSingleSHUFPSMask(HalfMask) || Subtarget.hasFastVariableCrossLaneShuffle())) return SDValue(); - // If this is a unary shuffle (assume that the 2nd operand is + // If this is an unary shuffle (assume that the 2nd operand is // canonicalized to undef), then we can use vpermpd. Otherwise, we // are better off extracting the upper half of 1 operand and using a // narrow shuffle. if (EltWidth == 64 && V2.isUndef()) return SDValue(); + // If this is an unary vXi8 shuffle with inplace halves, then perform as + // full width pshufb, and then merge. + if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1) + return SDValue(); } // AVX512 has efficient cross-lane shuffles for all legal 512-bit types. if (Subtarget.hasAVX512() && VT.is512BitVector()) diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll index 9642e5e4c9f86..26af46263c0e2 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll @@ -699,16 +699,13 @@ define <16 x i8> @evenelts_v32i16_shuffle_v16i16_to_v16i8(<32 x i16> %n2) nounwi ; ; AVX2-LABEL: evenelts_v32i16_shuffle_v16i16_to_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,0,0,4] +; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -783,16 +780,13 @@ define <16 x i8> @oddelts_v32i16_shuffle_v16i16_to_v16i8(<32 x i16> %n2) nounwin ; ; AVX2-LABEL: oddelts_v32i16_shuffle_v16i16_to_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 
$1, %ymm0, %xmm3 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,0,0,4] +; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll index f0f02f1ed890a..ec442c185706c 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -275,53 +275,45 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) { ; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512F-NEXT: vpsrld $8, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,16,21] +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512VL-NEXT: vpsrld $8, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,8,13] +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u] +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512VL-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 +; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512BW-NEXT: vpsrld $8, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512BW-NEXT: vpmovsxbd 
{{.*#+}} xmm1 = [0,4,16,21] +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u] -; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512BWVL-NEXT: vpsrld $8, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512BWVL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,8,13] +; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u] +; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BWVL-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 +; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/trunc-vector-width.ll b/llvm/test/CodeGen/X86/trunc-vector-width.ll index bc6969c5cd37a..42cc624b5a535 100644 --- a/llvm/test/CodeGen/X86/trunc-vector-width.ll +++ b/llvm/test/CodeGen/X86/trunc-vector-width.ll @@ -4,14 +4,16 @@ define void @test(ptr %a0) #0 { ; CHECK-LABEL: test: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqu (%rdi), %xmm0 -; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = mem[0],xmm0[1,2,3] -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,4,5,5,0,0,1,1,u,u,u,u,u,u,u,u] -; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmovdqu (%rdi), %ymm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,0,0] +; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = mem[0],ymm0[1,2,3,4,5,6,7] +; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; CHECK-NEXT: vpextrb $1, %xmm0, (%rax) ; CHECK-NEXT: vpextrb $4, %xmm0, (%rax) ; CHECK-NEXT: vpextrb $8, %xmm0, (%rax) +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %load = load <64 x i8>, ptr %a0, align 1 %shuf = shufflevector <64 x i8> %load, <64 x i8> undef, <16 x i32> diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll index 01181d4b21d9d..abef980277ece 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll @@ -849,146 +849,122 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-LABEL: load_i8_stride4_vf16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = 
[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 -; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,0,0,4] +; AVX2-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm5 +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7 -; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6 +; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX2-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-NEXT: vpshufb %ymm5, %ymm2, %ymm6 +; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm5 +; AVX2-NEXT: vpermd %ymm5, %ymm4, %ymm5 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm7 -; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm6 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-NEXT: vmovd {{.*#+}} xmm7 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm7 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-NEXT: vmovdqa %xmm4, (%rsi) -; AVX2-NEXT: vmovdqa %xmm5, (%rdx) -; AVX2-NEXT: vmovdqa %xmm6, (%rcx) +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, 
%ymm2, %xmm7 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX2-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-NEXT: vmovdqa %xmm1, (%rsi) +; AVX2-NEXT: vmovdqa %xmm3, (%rdx) +; AVX2-NEXT: vmovdqa %xmm5, (%rcx) ; AVX2-NEXT: vmovdqa %xmm0, (%r8) +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i8_stride4_vf16: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FP-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-FP-NEXT: vpshufb %ymm1, %ymm2, %ymm3 +; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm1 +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,0,0,4] +; AVX2-FP-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-FP-NEXT: vpshufb %ymm3, %ymm2, %ymm5 +; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-FP-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm7 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm6 +; AVX2-FP-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX2-FP-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm2, %ymm6 +; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm0, %ymm5 +; AVX2-FP-NEXT: vpermd %ymm5, %ymm4, %ymm5 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm3, %xmm7 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm2, %xmm6 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-FP-NEXT: vmovd {{.*#+}} xmm7 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm7 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX2-FP-NEXT: 
vpbroadcastd {{.*#+}} xmm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-FP-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vmovdqa %xmm4, (%rsi) -; AVX2-FP-NEXT: vmovdqa %xmm5, (%rdx) -; AVX2-FP-NEXT: vmovdqa %xmm6, (%rcx) +; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; AVX2-FP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-FP-NEXT: vmovdqa %xmm1, (%rsi) +; AVX2-FP-NEXT: vmovdqa %xmm3, (%rdx) +; AVX2-FP-NEXT: vmovdqa %xmm5, (%rcx) ; AVX2-FP-NEXT: vmovdqa %xmm0, (%r8) +; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i8_stride4_vf16: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm3 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,0,0,4] +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm5 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm7 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm6 +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = 
[2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm6 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm5 +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm7 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm6 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm7 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm7 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vmovdqa %xmm4, (%rsi) -; AVX2-FCP-NEXT: vmovdqa %xmm5, (%rdx) -; AVX2-FCP-NEXT: vmovdqa %xmm6, (%rcx) +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rsi) +; AVX2-FCP-NEXT: vmovdqa %xmm3, (%rdx) +; AVX2-FCP-NEXT: vmovdqa %xmm5, (%rcx) ; AVX2-FCP-NEXT: vmovdqa %xmm0, (%r8) +; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i8_stride4_vf16: @@ -1446,228 +1422,198 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-LABEL: load_i8_stride4_vf32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm7 -; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm6 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm8 -; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm9 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,3] -; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm9 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4] -; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9 -; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm7 -; AVX2-NEXT: vpermd %ymm7, %ymm6, %ymm7 -; 
AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm9 -; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8 +; AVX2-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-NEXT: vpshufb %ymm5, %ymm4, %ymm6 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4] +; AVX2-NEXT: vpermd %ymm6, %ymm2, %ymm6 +; AVX2-NEXT: vpshufb %ymm5, %ymm3, %ymm7 +; AVX2-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-NEXT: vpshufb %ymm5, %ymm1, %ymm7 +; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm5 +; AVX2-NEXT: vpermd %ymm5, %ymm2, %ymm5 +; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-NEXT: vpshufb %ymm6, %ymm4, %ymm7 +; AVX2-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm8 +; AVX2-NEXT: vpermd %ymm8, %ymm2, %ymm8 +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm8 +; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm11 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX2-NEXT: vpshufb %ymm9, %ymm1, %ymm10 -; AVX2-NEXT: vpermd %ymm10, %ymm6, %ymm10 -; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm9 -; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9 -; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-NEXT: vpshufb %xmm9, %xmm5, %xmm10 -; AVX2-NEXT: vpshufb %xmm9, %xmm4, %xmm9 +; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm6 +; AVX2-NEXT: vpermd %ymm6, %ymm2, %ymm6 +; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-NEXT: vpshufb %ymm7, %ymm4, %ymm8 +; AVX2-NEXT: vpermd %ymm8, %ymm2, %ymm8 +; AVX2-NEXT: vpshufb %ymm7, %ymm3, %ymm9 +; AVX2-NEXT: vpermd %ymm9, %ymm2, %ymm9 +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm9 +; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-NEXT: vpshufb %xmm10, %xmm3, %xmm11 -; AVX2-NEXT: vpshufb %xmm10, %xmm2, %xmm12 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm11 = 
xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] -; AVX2-NEXT: vpshufb %ymm10, %ymm1, %ymm11 -; AVX2-NEXT: vpermd %ymm11, %ymm6, %ymm11 -; AVX2-NEXT: vpshufb %ymm10, %ymm0, %ymm10 -; AVX2-NEXT: vpermd %ymm10, %ymm6, %ymm10 -; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-NEXT: vpshufb %xmm10, %xmm5, %xmm5 -; AVX2-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX2-NEXT: vpermd %ymm0, %ymm6, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovdqa %ymm7, (%rsi) -; AVX2-NEXT: vmovdqa %ymm8, (%rdx) -; AVX2-NEXT: vmovdqa %ymm9, (%rcx) +; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm7 +; AVX2-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-NEXT: vpshufb %ymm8, %ymm4, %ymm4 +; AVX2-NEXT: vpermd %ymm4, %ymm2, %ymm4 +; AVX2-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX2-NEXT: vpermd %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vmovdqa %ymm5, (%rsi) +; AVX2-NEXT: vmovdqa %ymm6, (%rdx) +; AVX2-NEXT: vmovdqa %ymm7, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, (%r8) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i8_stride4_vf32: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm7 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm4, %xmm6 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm3, %xmm8 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm9 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,3] -; AVX2-FP-NEXT: vpshufb %ymm7, %ymm1, %ymm9 -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4] -; AVX2-FP-NEXT: 
vpermd %ymm9, %ymm6, %ymm9 -; AVX2-FP-NEXT: vpshufb %ymm7, %ymm0, %ymm7 -; AVX2-FP-NEXT: vpermd %ymm7, %ymm6, %ymm7 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm5, %xmm9 -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm4, %xmm8 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm4, %ymm6 +; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4] +; AVX2-FP-NEXT: vpermd %ymm6, %ymm2, %ymm6 +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm3, %ymm7 +; AVX2-FP-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm1, %ymm7 +; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm0, %ymm5 +; AVX2-FP-NEXT: vpermd %ymm5, %ymm2, %ymm5 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-FP-NEXT: vpshufb %ymm6, %ymm4, %ymm7 +; AVX2-FP-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-FP-NEXT: vpshufb %ymm6, %ymm3, %ymm8 +; AVX2-FP-NEXT: vpermd %ymm8, %ymm2, %ymm8 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FP-NEXT: vpshufb %ymm6, %ymm1, %ymm8 +; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm2, %xmm11 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX2-FP-NEXT: vpshufb %ymm9, %ymm1, %ymm10 -; AVX2-FP-NEXT: vpermd %ymm10, %ymm6, %ymm10 -; AVX2-FP-NEXT: vpshufb %ymm9, %ymm0, %ymm9 -; AVX2-FP-NEXT: vpermd %ymm9, %ymm6, %ymm9 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm5, %xmm10 -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm4, %xmm9 +; AVX2-FP-NEXT: vpshufb %ymm6, %ymm0, %ymm6 +; AVX2-FP-NEXT: vpermd %ymm6, %ymm2, %ymm6 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FP-NEXT: vpshufb %ymm7, %ymm4, %ymm8 +; AVX2-FP-NEXT: vpermd %ymm8, %ymm2, %ymm8 +; AVX2-FP-NEXT: vpshufb %ymm7, %ymm3, %ymm9 +; AVX2-FP-NEXT: vpermd %ymm9, %ymm2, %ymm9 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FP-NEXT: vpshufb %ymm7, %ymm1, %ymm9 +; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm9 = 
xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm3, %xmm11 -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm2, %xmm12 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] -; AVX2-FP-NEXT: vpshufb %ymm10, %ymm1, %ymm11 -; AVX2-FP-NEXT: vpermd %ymm11, %ymm6, %ymm11 -; AVX2-FP-NEXT: vpshufb %ymm10, %ymm0, %ymm10 -; AVX2-FP-NEXT: vpermd %ymm10, %ymm6, %ymm10 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm5, %xmm5 -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-FP-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX2-FP-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX2-FP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpermd %ymm0, %ymm6, %ymm0 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovdqa %ymm7, (%rsi) -; AVX2-FP-NEXT: vmovdqa %ymm8, (%rdx) -; AVX2-FP-NEXT: vmovdqa %ymm9, (%rcx) +; AVX2-FP-NEXT: vpshufb %ymm7, %ymm0, %ymm7 +; AVX2-FP-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FP-NEXT: vpshufb %ymm8, %ymm4, %ymm4 +; AVX2-FP-NEXT: vpermd %ymm4, %ymm2, %ymm4 +; AVX2-FP-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX2-FP-NEXT: vpermd %ymm3, %ymm2, %ymm3 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FP-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-FP-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa %ymm5, (%rsi) +; AVX2-FP-NEXT: vmovdqa %ymm6, (%rdx) +; AVX2-FP-NEXT: vmovdqa %ymm7, (%rcx) ; AVX2-FP-NEXT: vmovdqa %ymm0, (%r8) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i8_stride4_vf32: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm7 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm6 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-FCP-NEXT: 
vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm8 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm9 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,3] -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm9 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4] -; AVX2-FCP-NEXT: vpermd %ymm9, %ymm6, %ymm9 -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm7 -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm6, %ymm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm9 -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm8 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm6 +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4] +; AVX2-FCP-NEXT: vpermd %ymm6, %ymm2, %ymm6 +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm7 +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm7 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm5 +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm2, %ymm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm7 +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm8 +; AVX2-FCP-NEXT: vpermd %ymm8, %ymm2, %ymm8 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm8 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm11 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm10 -; AVX2-FCP-NEXT: vpermd %ymm10, %ymm6, %ymm10 -; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm9 -; AVX2-FCP-NEXT: vpermd %ymm9, %ymm6, %ymm9 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm10 -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm9 +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm6 +; AVX2-FCP-NEXT: vpermd %ymm6, %ymm2, %ymm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; 
AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm8 +; AVX2-FCP-NEXT: vpermd %ymm8, %ymm2, %ymm8 +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm9 +; AVX2-FCP-NEXT: vpermd %ymm9, %ymm2, %ymm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm9 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm11 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm12 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm11 -; AVX2-FCP-NEXT: vpermd %ymm11, %ymm6, %ymm11 -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm10 -; AVX2-FCP-NEXT: vpermd %ymm10, %ymm6, %ymm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm5 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm6, %ymm0 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa %ymm7, (%rsi) -; AVX2-FCP-NEXT: vmovdqa %ymm8, (%rdx) -; AVX2-FCP-NEXT: vmovdqa %ymm9, (%rcx) +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm7 +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm2, %ymm4 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm2, %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa %ymm5, (%rsi) +; AVX2-FCP-NEXT: vmovdqa %ymm6, (%rdx) +; AVX2-FCP-NEXT: vmovdqa %ymm7, (%rcx) ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%r8) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -2696,517 
+2642,379 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-LABEL: load_i8_stride4_vf64: ; AVX2: # %bb.0: -; AVX2-NEXT: subq $168, %rsp -; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa (%rdi), %xmm4 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm1 -; AVX2-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpshufb %xmm3, %xmm12, %xmm8 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-NEXT: vpshufb %xmm13, %xmm5, %xmm8 -; AVX2-NEXT: vpshufb %xmm13, %xmm4, %xmm9 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3] -; AVX2-NEXT: vpshufb %ymm13, %ymm2, %ymm9 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,4,0,4,0,4,0,4] -; AVX2-NEXT: vpermd %ymm9, %ymm1, %ymm9 -; AVX2-NEXT: vpshufb %ymm13, %ymm0, %ymm10 -; AVX2-NEXT: vpermd %ymm10, %ymm1, %ymm10 -; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 176(%rdi), %xmm8 -; AVX2-NEXT: vpshufb %xmm3, %xmm8, %xmm10 -; AVX2-NEXT: vmovdqa 160(%rdi), %xmm9 -; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] -; AVX2-NEXT: vmovdqa 144(%rdi), %xmm0 -; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpshufb %xmm13, %xmm0, %xmm10 -; AVX2-NEXT: vmovdqa 128(%rdi), %xmm0 -; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpshufb %xmm13, %xmm0, %xmm14 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] -; AVX2-NEXT: vmovdqa 224(%rdi), %ymm11 -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3] -; AVX2-NEXT: vpshufb %ymm13, %ymm11, %ymm14 -; AVX2-NEXT: vpermd %ymm14, %ymm1, %ymm15 -; AVX2-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vpshufb %ymm13, %ymm0, %ymm13 -; AVX2-NEXT: vpermd %ymm13, %ymm1, %ymm13 -; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm13 -; AVX2-NEXT: vpshufb %xmm3, %xmm12, %xmm15 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-NEXT: vpshufb %xmm15, %xmm5, %xmm0 -; AVX2-NEXT: vmovdqa %xmm5, %xmm10 -; AVX2-NEXT: vpshufb %xmm15, %xmm4, %xmm2 -; AVX2-NEXT: vmovdqa %xmm4, %xmm14 -; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] -; 
AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm15, %ymm6, %ymm2 -; AVX2-NEXT: vpermd %ymm2, %ymm1, %ymm2 -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm15, %ymm5, %ymm13 -; AVX2-NEXT: vpermd %ymm13, %ymm1, %ymm13 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa %xmm8, %xmm4 -; AVX2-NEXT: vpshufb %xmm3, %xmm8, %xmm0 -; AVX2-NEXT: vmovdqa %xmm9, %xmm7 -; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-NEXT: vpshufb %xmm15, %xmm8, %xmm2 -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-NEXT: vpshufb %xmm15, %xmm9, %xmm3 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-NEXT: vpshufb %ymm15, %ymm11, %ymm2 -; AVX2-NEXT: vpermd %ymm2, %ymm1, %ymm2 -; AVX2-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm15, %ymm3, %ymm3 -; AVX2-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm0, %xmm12, %xmm3 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-NEXT: vpshufb %xmm3, %xmm10, %xmm13 -; AVX2-NEXT: vpshufb %xmm3, %xmm14, %xmm15 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] -; AVX2-NEXT: vpshufb %ymm3, %ymm6, %ymm13 -; AVX2-NEXT: vpermd %ymm13, %ymm1, %ymm13 -; AVX2-NEXT: vpshufb %ymm3, %ymm5, %ymm15 -; AVX2-NEXT: vpermd %ymm15, %ymm1, %ymm15 +; AVX2-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-NEXT: vmovdqa 224(%rdi), %ymm4 +; AVX2-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-NEXT: vmovdqa 64(%rdi), %ymm8 +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm9 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-NEXT: vpshufb %ymm10, %ymm9, %ymm5 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,4,0,4,0,4,0,4] +; AVX2-NEXT: vpermd %ymm5, %ymm3, %ymm5 +; AVX2-NEXT: vpshufb %ymm10, %ymm8, %ymm11 +; AVX2-NEXT: vpermd %ymm11, %ymm3, %ymm11 +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7] +; AVX2-NEXT: vpshufb %ymm10, %ymm7, %ymm11 +; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] +; AVX2-NEXT: vpshufb %ymm10, %ymm6, %ymm12 +; AVX2-NEXT: vpermd %ymm12, %ymm3, %ymm12 +; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-NEXT: vpshufb %ymm10, %ymm4, %ymm11 +; AVX2-NEXT: vpermd %ymm11, %ymm3, %ymm11 +; AVX2-NEXT: vpshufb %ymm10, %ymm2, %ymm12 +; AVX2-NEXT: vpermd %ymm12, %ymm3, %ymm12 +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX2-NEXT: vpshufb %ymm10, %ymm1, %ymm12 +; AVX2-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] +; AVX2-NEXT: vpshufb %ymm10, %ymm0, %ymm10 +; AVX2-NEXT: vpermd %ymm10, %ymm3, %ymm10 +; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm12 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-NEXT: vpshufb %ymm12, %ymm9, %ymm11 +; AVX2-NEXT: vpermd %ymm11, %ymm3, %ymm11 +; AVX2-NEXT: vpshufb %ymm12, %ymm8, %ymm13 +; AVX2-NEXT: vpermd %ymm13, %ymm3, %ymm13 +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7] +; AVX2-NEXT: vpshufb %ymm12, %ymm7, %ymm13 +; AVX2-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] +; AVX2-NEXT: vpshufb %ymm12, %ymm6, %ymm14 +; AVX2-NEXT: vpermd %ymm14, %ymm3, %ymm14 +; AVX2-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vpshufb %ymm12, %ymm4, %ymm13 +; AVX2-NEXT: vpermd %ymm13, %ymm3, %ymm13 +; AVX2-NEXT: vpshufb %ymm12, %ymm2, %ymm14 +; AVX2-NEXT: vpermd %ymm14, %ymm3, %ymm14 +; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] +; AVX2-NEXT: vpshufb %ymm12, %ymm1, %ymm14 +; AVX2-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX2-NEXT: vpshufb %ymm12, %ymm0, %ymm12 +; AVX2-NEXT: vpermd %ymm12, %ymm3, %ymm12 +; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm14 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-NEXT: vpshufb %ymm14, %ymm9, %ymm13 +; AVX2-NEXT: vpermd %ymm13, %ymm3, %ymm13 +; AVX2-NEXT: vpshufb %ymm14, %ymm8, %ymm15 +; AVX2-NEXT: vpermd %ymm15, %ymm3, %ymm15 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm13[4,5,6,7] -; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX2-NEXT: vmovdqa %xmm4, %xmm14 -; AVX2-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-NEXT: vpshufb %xmm3, %xmm8, %xmm2 -; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm15 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-NEXT: vmovdqa %ymm11, %ymm15 -; AVX2-NEXT: vpshufb %ymm3, %ymm11, %ymm2 -; AVX2-NEXT: vpermd %ymm2, %ymm1, %ymm2 -; AVX2-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm3, %ymm11, %ymm3 -; AVX2-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm0, %xmm12, 
%xmm6 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-NEXT: vpshufb %xmm6, %xmm10, %xmm5 -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX2-NEXT: vpermd %ymm4, %ymm1, %ymm4 -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX2-NEXT: vpermd %ymm5, %ymm1, %ymm5 -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vpshufb %xmm0, %xmm14, %xmm4 -; AVX2-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; AVX2-NEXT: vpshufb %xmm6, %xmm8, %xmm4 -; AVX2-NEXT: vpshufb %xmm6, %xmm9, %xmm5 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] -; AVX2-NEXT: vpshufb %ymm6, %ymm15, %ymm4 -; AVX2-NEXT: vpshufb %ymm6, %ymm11, %ymm5 -; AVX2-NEXT: vpermd %ymm4, %ymm1, %ymm4 -; AVX2-NEXT: vpermd %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpshufb %ymm14, %ymm7, %ymm15 +; AVX2-NEXT: vextracti128 $1, %ymm15, %xmm5 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1] +; AVX2-NEXT: vpshufb %ymm14, %ymm6, %ymm15 +; AVX2-NEXT: vpermd %ymm15, %ymm3, %ymm15 +; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm15[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm13[4,5,6,7] +; AVX2-NEXT: vpshufb %ymm14, %ymm4, %ymm5 +; AVX2-NEXT: vpermd %ymm5, %ymm3, %ymm5 +; AVX2-NEXT: vpshufb %ymm14, %ymm2, %ymm15 +; AVX2-NEXT: vpermd %ymm15, %ymm3, %ymm15 +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm5[6,7] +; AVX2-NEXT: vpshufb %ymm14, %ymm1, %ymm15 +; AVX2-NEXT: vextracti128 $1, %ymm15, %xmm10 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm15[0],xmm10[0],xmm15[1],xmm10[1] +; AVX2-NEXT: vpshufb %ymm14, %ymm0, %ymm14 +; AVX2-NEXT: vpermd %ymm14, %ymm3, %ymm14 +; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-NEXT: vpshufb %ymm5, %ymm9, %ymm9 +; AVX2-NEXT: vpermd %ymm9, %ymm3, %ymm9 +; AVX2-NEXT: vpshufb %ymm5, %ymm8, %ymm8 +; AVX2-NEXT: vpermd %ymm8, %ymm3, %ymm8 +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-NEXT: vpshufb %ymm5, %ymm7, %ymm7 +; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; AVX2-NEXT: vpshufb %ymm5, %ymm6, %ymm6 +; AVX2-NEXT: vpermd %ymm6, %ymm3, %ymm6 +; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpermd %ymm4, %ymm3, %ymm4 +; AVX2-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX2-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-NEXT: vmovdqa %ymm3, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm12, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm11, (%rdx) +; AVX2-NEXT: vmovdqa %ymm14, 32(%rcx) ; AVX2-NEXT: vmovdqa %ymm13, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%r8) -; AVX2-NEXT: vmovdqa %ymm2, (%r8) -; AVX2-NEXT: addq $168, %rsp +; AVX2-NEXT: vmovdqa %ymm6, (%r8) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i8_stride4_vf64: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: subq $168, %rsp -; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm4 -; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm7, %xmm1 -; AVX2-FP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm12, %xmm8 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FP-NEXT: vpshufb %xmm13, %xmm5, %xmm8 -; AVX2-FP-NEXT: vpshufb %xmm13, %xmm4, %xmm9 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3] -; AVX2-FP-NEXT: vpshufb %ymm13, %ymm2, %ymm9 -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,4,0,4,0,4,0,4] -; AVX2-FP-NEXT: vpermd %ymm9, %ymm1, %ymm9 -; AVX2-FP-NEXT: vpshufb %ymm13, %ymm0, %ymm10 -; AVX2-FP-NEXT: vpermd %ymm10, %ymm1, %ymm10 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 176(%rdi), %xmm8 -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm8, %xmm10 -; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm9 -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] -; AVX2-FP-NEXT: vmovdqa 144(%rdi), %xmm0 -; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm13, %xmm0, %xmm10 -; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm0 -; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm13, %xmm0, %xmm14 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] -; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm11 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = 
xmm14[0,1],xmm3[2,3] -; AVX2-FP-NEXT: vpshufb %ymm13, %ymm11, %ymm14 -; AVX2-FP-NEXT: vpermd %ymm14, %ymm1, %ymm15 -; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %ymm13, %ymm0, %ymm13 -; AVX2-FP-NEXT: vpermd %ymm13, %ymm1, %ymm13 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm7, %xmm13 -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm12, %xmm15 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-FP-NEXT: vpshufb %xmm15, %xmm5, %xmm0 -; AVX2-FP-NEXT: vmovdqa %xmm5, %xmm10 -; AVX2-FP-NEXT: vpshufb %xmm15, %xmm4, %xmm2 -; AVX2-FP-NEXT: vmovdqa %xmm4, %xmm14 -; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm15, %ymm6, %ymm2 -; AVX2-FP-NEXT: vpermd %ymm2, %ymm1, %ymm2 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm15, %ymm5, %ymm13 -; AVX2-FP-NEXT: vpermd %ymm13, %ymm1, %ymm13 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa %xmm8, %xmm4 -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm8, %xmm0 -; AVX2-FP-NEXT: vmovdqa %xmm9, %xmm7 -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm9, %xmm2 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FP-NEXT: vpshufb %xmm15, %xmm8, %xmm2 -; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FP-NEXT: vpshufb %xmm15, %xmm9, %xmm3 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FP-NEXT: vpshufb %ymm15, %ymm11, %ymm2 -; AVX2-FP-NEXT: vpermd %ymm2, %ymm1, %ymm2 -; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm15, %ymm3, %ymm3 -; AVX2-FP-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm12, %xmm3 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm10, %xmm13 -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm14, %xmm15 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX2-FP-NEXT: 
vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vpshufb %ymm3, %ymm6, %ymm13 -; AVX2-FP-NEXT: vpermd %ymm13, %ymm1, %ymm13 -; AVX2-FP-NEXT: vpshufb %ymm3, %ymm5, %ymm15 -; AVX2-FP-NEXT: vpermd %ymm15, %ymm1, %ymm15 +; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm8 +; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm9 +; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm9, %ymm5 +; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,4,0,4,0,4,0,4] +; AVX2-FP-NEXT: vpermd %ymm5, %ymm3, %ymm5 +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm8, %ymm11 +; AVX2-FP-NEXT: vpermd %ymm11, %ymm3, %ymm11 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm7, %ymm11 +; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm6, %ymm12 +; AVX2-FP-NEXT: vpermd %ymm12, %ymm3, %ymm12 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm4, %ymm11 +; AVX2-FP-NEXT: vpermd %ymm11, %ymm3, %ymm11 +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm2, %ymm12 +; AVX2-FP-NEXT: vpermd %ymm12, %ymm3, %ymm12 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm1, %ymm12 +; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm0, %ymm10 +; AVX2-FP-NEXT: vpermd %ymm10, %ymm3, %ymm10 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-FP-NEXT: vpshufb %ymm12, %ymm9, %ymm11 +; AVX2-FP-NEXT: vpermd %ymm11, %ymm3, %ymm11 +; AVX2-FP-NEXT: vpshufb %ymm12, %ymm8, %ymm13 +; AVX2-FP-NEXT: vpermd %ymm13, %ymm3, %ymm13 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FP-NEXT: vpshufb %ymm12, %ymm7, %ymm13 +; AVX2-FP-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] +; AVX2-FP-NEXT: vpshufb %ymm12, %ymm6, %ymm14 +; AVX2-FP-NEXT: vpermd %ymm14, %ymm3, %ymm14 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FP-NEXT: vpshufb %ymm12, %ymm4, %ymm13 +; AVX2-FP-NEXT: vpermd %ymm13, %ymm3, %ymm13 +; AVX2-FP-NEXT: vpshufb %ymm12, %ymm2, %ymm14 +; AVX2-FP-NEXT: vpermd %ymm14, %ymm3, %ymm14 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FP-NEXT: vpshufb %ymm12, %ymm1, %ymm14 +; AVX2-FP-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX2-FP-NEXT: vpshufb %ymm12, %ymm0, %ymm12 +; AVX2-FP-NEXT: vpermd %ymm12, %ymm3, %ymm12 +; AVX2-FP-NEXT: 
vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FP-NEXT: vpshufb %ymm14, %ymm9, %ymm13 +; AVX2-FP-NEXT: vpermd %ymm13, %ymm3, %ymm13 +; AVX2-FP-NEXT: vpshufb %ymm14, %ymm8, %ymm15 +; AVX2-FP-NEXT: vpermd %ymm15, %ymm3, %ymm15 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX2-FP-NEXT: vmovdqa %xmm4, %xmm14 -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm8, %xmm2 -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm9, %xmm15 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FP-NEXT: vmovdqa %ymm11, %ymm15 -; AVX2-FP-NEXT: vpshufb %ymm3, %ymm11, %ymm2 -; AVX2-FP-NEXT: vpermd %ymm2, %ymm1, %ymm2 -; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm3, %ymm11, %ymm3 -; AVX2-FP-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm12, %xmm6 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm10, %xmm5 -; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX2-FP-NEXT: vpermd %ymm4, %ymm1, %ymm4 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX2-FP-NEXT: vpermd %ymm5, %ymm1, %ymm5 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm14, %xmm4 -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm8, %xmm4 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm9, %xmm5 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] -; AVX2-FP-NEXT: vpshufb %ymm6, %ymm15, %ymm4 -; AVX2-FP-NEXT: vpshufb %ymm6, %ymm11, %ymm5 -; AVX2-FP-NEXT: vpermd %ymm4, %ymm1, %ymm4 -; AVX2-FP-NEXT: vpermd %ymm5, %ymm1, %ymm1 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vpshufb %ymm14, %ymm7, %ymm15 +; AVX2-FP-NEXT: vextracti128 $1, %ymm15, %xmm5 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1] +; 
AVX2-FP-NEXT: vpshufb %ymm14, %ymm6, %ymm15 +; AVX2-FP-NEXT: vpermd %ymm15, %ymm3, %ymm15 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm15[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FP-NEXT: vpshufb %ymm14, %ymm4, %ymm5 +; AVX2-FP-NEXT: vpermd %ymm5, %ymm3, %ymm5 +; AVX2-FP-NEXT: vpshufb %ymm14, %ymm2, %ymm15 +; AVX2-FP-NEXT: vpermd %ymm15, %ymm3, %ymm15 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FP-NEXT: vpshufb %ymm14, %ymm1, %ymm15 +; AVX2-FP-NEXT: vextracti128 $1, %ymm15, %xmm10 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm15[0],xmm10[0],xmm15[1],xmm10[1] +; AVX2-FP-NEXT: vpshufb %ymm14, %ymm0, %ymm14 +; AVX2-FP-NEXT: vpermd %ymm14, %ymm3, %ymm14 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm9, %ymm9 +; AVX2-FP-NEXT: vpermd %ymm9, %ymm3, %ymm9 +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm8, %ymm8 +; AVX2-FP-NEXT: vpermd %ymm8, %ymm3, %ymm8 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm7, %ymm7 +; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm6, %ymm6 +; AVX2-FP-NEXT: vpermd %ymm6, %ymm3, %ymm6 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FP-NEXT: vpermd %ymm4, %ymm3, %ymm4 +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FP-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FP-NEXT: vmovdqa %ymm3, 32(%rcx) +; AVX2-FP-NEXT: vmovdqa %ymm12, 32(%rdx) +; AVX2-FP-NEXT: vmovdqa %ymm11, (%rdx) +; AVX2-FP-NEXT: vmovdqa %ymm14, 32(%rcx) ; AVX2-FP-NEXT: vmovdqa %ymm13, (%rcx) ; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%r8) -; AVX2-FP-NEXT: vmovdqa %ymm2, (%r8) -; AVX2-FP-NEXT: addq $168, %rsp +; AVX2-FP-NEXT: vmovdqa %ymm6, (%r8) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i8_stride4_vf64: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: subq $168, %rsp -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm4 -; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm5 -; 
AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm1 -; AVX2-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm8 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm8 -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm9 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm9 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,4,0,4,0,4,0,4] -; AVX2-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm9 -; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm10 -; AVX2-FCP-NEXT: vpermd %ymm10, %ymm1, %ymm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 176(%rdi), %xmm8 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm10 -; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm9 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] -; AVX2-FCP-NEXT: vmovdqa 144(%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm10 -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm14 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] -; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm11 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3] -; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm11, %ymm14 -; AVX2-FCP-NEXT: vpermd %ymm14, %ymm1, %ymm15 -; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm13 -; AVX2-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm13 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm15 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm5, %xmm10 -; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm2 -; AVX2-FCP-NEXT: vmovdqa %xmm4, %xmm14 -; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm2 -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm2 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; 
AVX2-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm13 -; AVX2-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa %xmm8, %xmm4 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm9, %xmm7 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm2 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm2 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm9, %xmm3 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm11, %ymm2 -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm2 -; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm3 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm13 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm15 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm13 -; AVX2-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm13 -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm15 -; AVX2-FCP-NEXT: vpermd %ymm15, %ymm1, %ymm15 +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm8 +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm9 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm5 +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,4,0,4,0,4,0,4] +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm3, %ymm5 +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11 +; AVX2-FCP-NEXT: vpermd %ymm11, %ymm3, %ymm11 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm11 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm12 +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm12 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm11 +; AVX2-FCP-NEXT: vpermd %ymm11, %ymm3, %ymm11 +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm12 +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm12 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm12 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm10 +; AVX2-FCP-NEXT: vpermd %ymm10, %ymm3, %ymm10 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm9, %ymm11 +; AVX2-FCP-NEXT: vpermd %ymm11, %ymm3, %ymm11 +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm13 +; AVX2-FCP-NEXT: vpermd %ymm13, %ymm3, %ymm13 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm13 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm14 +; AVX2-FCP-NEXT: vpermd %ymm14, %ymm3, %ymm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm13 +; AVX2-FCP-NEXT: vpermd %ymm13, %ymm3, %ymm13 +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm14 +; AVX2-FCP-NEXT: vpermd %ymm14, %ymm3, %ymm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm14 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm12 +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm12 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm13 +; AVX2-FCP-NEXT: vpermd %ymm13, %ymm3, %ymm13 +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm8, %ymm15 +; AVX2-FCP-NEXT: vpermd %ymm15, %ymm3, %ymm15 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX2-FCP-NEXT: vmovdqa %xmm4, %xmm14 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm15 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vmovdqa %ymm11, %ymm15 -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm2 -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm2 -; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm3 -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm6 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm10, %xmm5 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm4 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm5 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm4 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm4 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm9, %xmm5 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm15, %ymm4 -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm5 -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm4 -; AVX2-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm15 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm15, %xmm5 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1] +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm15 +; AVX2-FCP-NEXT: vpermd %ymm15, %ymm3, %ymm15 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm15[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm5 +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm3, %ymm5 +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm15 +; AVX2-FCP-NEXT: vpermd %ymm15, %ymm3, %ymm15 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm15 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm15, %xmm10 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm15[0],xmm10[0],xmm15[1],xmm10[1] +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm14 +; AVX2-FCP-NEXT: vpermd %ymm14, %ymm3, %ymm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm9, %ymm9 +; AVX2-FCP-NEXT: vpermd %ymm9, %ymm3, %ymm9 +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm8 +; AVX2-FCP-NEXT: vpermd %ymm8, %ymm3, %ymm8 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, 
%xmm9 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm6 +; AVX2-FCP-NEXT: vpermd %ymm6, %ymm3, %ymm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm3, %ymm4 +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FCP-NEXT: vmovdqa %ymm3, 32(%rcx) +; AVX2-FCP-NEXT: vmovdqa %ymm12, 32(%rdx) +; AVX2-FCP-NEXT: vmovdqa %ymm11, (%rdx) +; AVX2-FCP-NEXT: vmovdqa %ymm14, 32(%rcx) ; AVX2-FCP-NEXT: vmovdqa %ymm13, (%rcx) ; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%r8) -; AVX2-FCP-NEXT: vmovdqa %ymm2, (%r8) -; AVX2-FCP-NEXT: addq $168, %rsp +; AVX2-FCP-NEXT: vmovdqa %ymm6, (%r8) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll index e7bb02db62753..ac14f55e3f0ed 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll @@ -6395,203 +6395,203 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-LABEL: load_i8_stride5_vf64: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] -; AVX512-NEXT: vmovdqa64 (%rdi), %ymm23 -; AVX512-NEXT: vmovdqa64 32(%rdi), %ymm24 -; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm21 -; AVX512-NEXT: vmovdqa64 96(%rdi), %ymm22 +; AVX512-NEXT: vmovdqa64 (%rdi), %ymm24 +; AVX512-NEXT: vmovdqa64 32(%rdi), %ymm25 +; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm22 +; AVX512-NEXT: vmovdqa64 96(%rdi), %ymm23 ; AVX512-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22)) -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm4)) -; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm10, %ymm6, %ymm6 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4)) +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] 
+; AVX512-NEXT: vpshufb %ymm6, %ymm7, %ymm7 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] -; AVX512-NEXT: vmovdqa %ymm4, %ymm7 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm24 ^ (ymm7 & (ymm23 ^ ymm24)) -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[2,7,12,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u] -; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm12 -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = (ymm12 & ymm19) | ymm6 -; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm25 -; AVX512-NEXT: vmovdqa 224(%rdi), %ymm7 -; AVX512-NEXT: vmovdqa %ymm4, %ymm9 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm25 ^ (ymm9 & (ymm7 ^ ymm25)) -; AVX512-NEXT: vmovdqa 208(%rdi), %xmm8 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 ^ (mem & (ymm9 ^ ymm8)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,ymm9[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vmovdqa 176(%rdi), %xmm9 -; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] -; AVX512-NEXT: vmovdqa 160(%rdi), %xmm11 -; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] -; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512-NEXT: vmovdqa %ymm4, %ymm8 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25)) +; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u] +; AVX512-NEXT: vpor %xmm9, %xmm8, %xmm10 +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7 +; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm26 +; AVX512-NEXT: vmovdqa 224(%rdi), %ymm8 +; AVX512-NEXT: vmovdqa %ymm4, %ymm11 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26)) +; AVX512-NEXT: vmovdqa 208(%rdi), %xmm9 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9)) +; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa 160(%rdi), %ymm12 +; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5] +; AVX512-NEXT: vpermd %ymm12, %ymm17, %ymm15 ; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = (ymm0 & ~ymm16) | ymm13 -; AVX512-NEXT: vmovdqa 144(%rdi), %xmm13 -; AVX512-NEXT: vpshufb %xmm10, %xmm13, %xmm10 -; AVX512-NEXT: vmovdqa 128(%rdi), %xmm14 -; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512-NEXT: vpor %xmm10, %xmm15, %xmm10 -; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} 
zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm20 & (zmm10 ^ zmm12)) -; AVX512-NEXT: vmovdqa 256(%rdi), %ymm15 -; AVX512-NEXT: vmovdqa 288(%rdi), %ymm12 -; AVX512-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm15 ^ (ymm2 & (ymm12 ^ ymm15)) -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero -; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm18 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11 +; AVX512-NEXT: vmovdqa 144(%rdi), %xmm12 +; AVX512-NEXT: vpshufb %xmm6, %xmm12, %xmm6 +; AVX512-NEXT: vmovdqa 128(%rdi), %xmm13 +; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512-NEXT: vpor %xmm6, %xmm11, %xmm6 +; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10)) +; AVX512-NEXT: vmovdqa 256(%rdi), %ymm14 +; AVX512-NEXT: vmovdqa 288(%rdi), %ymm11 +; AVX512-NEXT: vmovdqa %ymm5, %ymm10 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14)) +; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11] +; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero +; AVX512-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm19 ; AVX512-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm15 ^ ymm12)) -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero -; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm6, %xmm0, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm7 ^ ymm25)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm8 ^ (mem & (ymm2 ^ ymm8)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,ymm2[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[1,6,11],zero,zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor 
%xmm3, %xmm10, %xmm3 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ~ymm16) | ymm2 +; AVX512-NEXT: vmovdqa %ymm5, %ymm6 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9)) +; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa 160(%rdi), %xmm15 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa 176(%rdi), %xmm6 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10 ; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm0 ^ (ymm16 & (ymm3 ^ ymm0)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) ; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] ; AVX512-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm24 ^ (ymm1 & (ymm23 ^ ymm24)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14],zero,zero,zero,xmm1[u,u,u] -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13,u,u,u] -; AVX512-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ymm19) | ymm2 -; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX512-NEXT: vmovdqa %ymm5, %ymm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u] +; AVX512-NEXT: vpor %xmm7, %xmm2, %xmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3 +; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512-NEXT: vpor %xmm0, %xmm3, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm1)) -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm17 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2)) +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 ; AVX512-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm15 ^ 
ymm12)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm4, %ymm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) ; AVX512-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] ; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX512-NEXT: vmovdqa %ymm10, %ymm3 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23)) -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[1,6,11],zero,zero,zero,zero,xmm6[4,9,14,u,u,u] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) +; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u] -; AVX512-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm19) | ymm2 -; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2 +; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero ; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm3)) -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3)) +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 ; AVX512-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm15 ^ (ymm0 & (ymm12 ^ ymm15)) +; 
AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14)) ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[3,8,13],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) ; AVX512-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX512-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u] -; AVX512-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX512-NEXT: vpor %xmm7, %xmm3, %xmm3 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2 -; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3)) ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm15 ^ (ymm4 & (ymm12 ^ ymm15)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,3,8,13],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero -; AVX512-NEXT: vextracti128 $1, %ymm12, 
%xmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm3 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] ; AVX512-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm25 ^ (ymm10 & (ymm7 ^ ymm25)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm8 ^ (mem & (ymm10 ^ ymm8)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9)) ; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[4,9,14],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm24 ^ ymm23)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24)) ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm1 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u] ; AVX512-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) ; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4)) ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,5,0,5,0,5,0,5] -; AVX512-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX512-NEXT: vpermd %ymm4, %ymm17, %ymm4 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1)) ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm18, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm17, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm19, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm18, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm20, (%rcx) ; AVX512-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512-NEXT: vzeroupper @@ -6600,203 +6600,203 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-LABEL: load_i8_stride5_vf64: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm23 -; AVX512-FCP-NEXT: vmovdqa64 32(%rdi), %ymm24 -; 
AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm21 -; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm22 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm24 +; AVX512-FCP-NEXT: vmovdqa64 32(%rdi), %ymm25 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm22 +; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm23 ; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm4)) -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4)) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm7 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] -; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm7 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm24 ^ (ymm7 & (ymm23 ^ ymm24)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[2,7,12,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm12 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = (ymm12 & ymm19) | ymm6 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm25 -; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm7 -; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm9 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm25 ^ (ymm9 & (ymm7 ^ ymm25)) -; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm8 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 ^ (mem & (ymm9 ^ ymm8)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,ymm9[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa 176(%rdi), %xmm9 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %xmm11 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] -; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u] +; AVX512-FCP-NEXT: vpor %xmm9, %xmm8, %xmm10 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26 +; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm8 +; AVX512-FCP-NEXT: vmovdqa %ymm4, 
%ymm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26)) +; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm9 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm12 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5] +; AVX512-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm15 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (ymm0 & ~ymm16) | ymm13 -; AVX512-FCP-NEXT: vmovdqa 144(%rdi), %xmm13 -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm10 -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm14 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm10, %xmm15, %xmm10 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm20 & (zmm10 ^ zmm12)) -; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm15 -; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm12 -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm15 ^ (ymm2 & (ymm12 ^ ymm15)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm18 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11 +; AVX512-FCP-NEXT: vmovdqa 144(%rdi), %xmm12 +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm6 +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm13 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512-FCP-NEXT: vpor %xmm6, %xmm11, %xmm6 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10)) +; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm14 +; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm11 +; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero +; AVX512-FCP-NEXT: vpor %xmm0, %xmm10, %xmm0 +; 
AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm19 ; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm15 ^ ymm12)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vpor %xmm6, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm7 ^ ymm25)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm8 ^ (mem & (ymm2 ^ ymm8)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,ymm2[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[1,6,11],zero,zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm3, %xmm10, %xmm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ~ymm16) | ymm2 +; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %xmm15 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa 176(%rdi), %xmm6 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10 ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm0 ^ (ymm16 & (ymm3 ^ ymm0)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] ; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm24 ^ (ymm1 & (ymm23 ^ ymm24)) -; AVX512-FCP-NEXT: 
vpshufb {{.*#+}} xmm6 = xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14],zero,zero,zero,xmm1[u,u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ymm19) | ymm2 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm1)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm17 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 ; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm15 ^ ymm12)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13] ; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) ; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) ; AVX512-FCP-NEXT: vpermq {{.*#+}} 
ymm2 = ymm0[2,3,0,1] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[1,6,11],zero,zero,zero,zero,xmm6[4,9,14,u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u] -; AVX512-FCP-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm19) | ymm2 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero ; AVX512-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm3)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 ; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm15 ^ (ymm0 & (ymm12 ^ ymm15)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14)) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero ; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[3,8,13],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) ; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm0 -; 
AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX512-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3)) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm15 ^ (ymm4 & (ymm12 ^ ymm15)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,3,8,13],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero -; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero +; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm3 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] ; AVX512-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm25 ^ (ymm10 & (ymm7 ^ ymm25)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm8 ^ (mem & (ymm10 ^ ymm8)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[4,9,14],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7] ; 
AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm24 ^ ymm23)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24)) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,5,0,5,0,5,0,5] -; AVX512-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX512-FCP-NEXT: vpermd %ymm4, %ymm17, %ymm4 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1)) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512-FCP-NEXT: vzeroupper @@ -6805,203 +6805,203 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-LABEL: load_i8_stride5_vf64: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm23 -; AVX512DQ-NEXT: vmovdqa64 32(%rdi), %ymm24 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm21 -; AVX512DQ-NEXT: vmovdqa64 96(%rdi), %ymm22 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm24 +; AVX512DQ-NEXT: vmovdqa64 32(%rdi), %ymm25 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm22 +; AVX512DQ-NEXT: vmovdqa64 96(%rdi), %ymm23 ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm4)) -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm10, %ymm6, %ymm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4)) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] +; AVX512DQ-NEXT: vpshufb %ymm6, %ymm7, %ymm7 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = 
[65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] -; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm7 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm24 ^ (ymm7 & (ymm23 ^ ymm24)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[2,7,12,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u] -; AVX512DQ-NEXT: vpor %xmm7, %xmm8, %xmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = (ymm12 & ymm19) | ymm6 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm25 -; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm7 -; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm9 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm25 ^ (ymm9 & (ymm7 ^ ymm25)) -; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm8 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 ^ (mem & (ymm9 ^ ymm8)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,ymm9[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa 176(%rdi), %xmm9 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm11 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] -; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u] +; AVX512DQ-NEXT: vpor %xmm9, %xmm8, %xmm10 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm26 +; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm8 +; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26)) +; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm12 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5] +; AVX512DQ-NEXT: vpermd %ymm12, %ymm17, %ymm15 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = (ymm0 & ~ymm16) | ymm13 -; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm13 -; AVX512DQ-NEXT: vpshufb %xmm10, %xmm13, %xmm10 -; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm14 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm10, %xmm15, %xmm10 -; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm10 -; AVX512DQ-NEXT: 
vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm20 & (zmm10 ^ zmm12)) -; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm15 -; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm12 -; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm15 ^ (ymm2 & (ymm12 ^ ymm15)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm18 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11 +; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm12 +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm12, %xmm6 +; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm13 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512DQ-NEXT: vpor %xmm6, %xmm11, %xmm6 +; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10)) +; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm14 +; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm11 +; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero +; AVX512DQ-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm19 ; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm15 ^ ymm12)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpor %xmm6, %xmm0, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm7 ^ ymm25)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm8 ^ (mem & (ymm2 ^ ymm8)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,ymm2[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm11[1,6,11],zero,zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm3, %xmm10, %xmm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ~ymm16) | ymm2 +; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm15 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa 176(%rdi), %xmm6 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10 ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm0 ^ (ymm16 & (ymm3 ^ ymm0)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] ; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm24 ^ (ymm1 & (ymm23 ^ ymm24)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14],zero,zero,zero,xmm1[u,u,u] -; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13,u,u,u] -; AVX512DQ-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ymm19) | ymm2 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u] +; AVX512DQ-NEXT: vpor %xmm7, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3 +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512DQ-NEXT: vpor %xmm0, %xmm3, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm1)) -; AVX512DQ-NEXT: 
vinserti64x4 $1, %ymm3, %zmm0, %zmm17 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm15 ^ ymm12)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13] ; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) ; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[1,6,11],zero,zero,zero,zero,xmm6[4,9,14,u,u,u] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u] -; AVX512DQ-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm19) | ymm2 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512DQ-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2 +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero ; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; 
AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm3)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 ; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm15 ^ (ymm0 & (ymm12 ^ ymm15)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14)) ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero ; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[3,8,13],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] ; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u] -; AVX512DQ-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpor %xmm7, %xmm3, %xmm3 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} 
zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3)) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm15 ^ (ymm4 & (ymm12 ^ ymm15)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,3,8,13],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero -; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero +; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm3 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] ; AVX512DQ-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm25 ^ (ymm10 & (ymm7 ^ ymm25)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm8 ^ (mem & (ymm10 ^ ymm8)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9)) ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[4,9,14],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm24 ^ ymm23)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24)) ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4)) ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,5,0,5,0,5,0,5] -; AVX512DQ-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX512DQ-NEXT: vpermd %ymm4, %ymm17, %ymm4 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1)) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rsi) +; 
AVX512DQ-NEXT: vmovdqa64 %zmm18, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512DQ-NEXT: vzeroupper @@ -7010,203 +7010,203 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-LABEL: load_i8_stride5_vf64: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rdi), %ymm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rdi), %ymm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm23 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm4)) -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4)) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm24 ^ (ymm7 & (ymm23 ^ ymm24)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[2,7,12,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = (ymm12 & ymm19) | ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm25 -; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm9 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm25 ^ (ymm9 & (ymm7 ^ ymm25)) -; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm8 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 ^ (mem & (ymm9 ^ ymm8)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,ymm9[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa 176(%rdi), %xmm9 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %xmm11 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25)) +; 
AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm8, %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26 +; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26)) +; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm9 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm12 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5] +; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (ymm0 & ~ymm16) | ymm13 -; AVX512DQ-FCP-NEXT: vmovdqa 144(%rdi), %xmm13 -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm10 -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm14 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm10, %xmm15, %xmm10 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm20 & (zmm10 ^ zmm12)) -; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm12 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm15 ^ (ymm2 & (ymm12 ^ ymm15)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm18 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa 144(%rdi), %xmm12 +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm11, %xmm6 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10)) +; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm15 ^ ymm12)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm7 ^ ymm25)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm8 ^ (mem & (ymm2 ^ ymm8)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,ymm2[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[1,6,11],zero,zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm10, %xmm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ~ymm16) | ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %xmm15 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa 176(%rdi), %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10 ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm0 ^ (ymm16 & (ymm3 ^ ymm0)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = 
[65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm24 ^ (ymm1 & (ymm23 ^ ymm24)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14],zero,zero,zero,xmm1[u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ymm19) | ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm17 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm15 ^ ymm12)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13] ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) ; AVX512DQ-FCP-NEXT: 
vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[1,6,11],zero,zero,zero,zero,xmm6[4,9,14,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm19) | ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm15 ^ (ymm0 & (ymm12 ^ ymm15)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14)) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} 
ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[3,8,13],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3)) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm15 ^ (ymm4 & (ymm12 ^ ymm15)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,3,8,13],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, 
%xmm3 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm25 ^ (ymm10 & (ymm7 ^ ymm25)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm8 ^ (mem & (ymm10 ^ ymm8)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9)) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[4,9,14],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm24 ^ ymm23)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24)) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4)) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,5,0,5,0,5,0,5] -; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm17, %ymm4 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1)) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -7231,163 +7231,164 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = 
xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] -; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm10 +; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm9 ; AVX512BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512BW-NEXT: kmovd %eax, %k5 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm5 -; AVX512BW-NEXT: vmovdqa 224(%rdi), %ymm4 -; AVX512BW-NEXT: vpblendmw %ymm5, %ymm4, %ymm6 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm6 +; AVX512BW-NEXT: vmovdqa 224(%rdi), %ymm5 +; AVX512BW-NEXT: vpblendmw %ymm6, %ymm5, %ymm4 {%k1} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] ; AVX512BW-NEXT: movl $4228, %eax # imm = 0x1084 ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k3} -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm7 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm4 {%k3} +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm19 = [0,5,0,5,0,5,0,5] +; AVX512BW-NEXT: vpermd %ymm8, %ymm19, %ymm8 ; AVX512BW-NEXT: movl $127, %eax ; AVX512BW-NEXT: kmovd %eax, %k4 -; AVX512BW-NEXT: vmovdqu8 %ymm9, %ymm8 {%k4} -; AVX512BW-NEXT: vmovdqa 144(%rdi), %xmm12 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[1,6,11] -; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm13 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm9, %xmm11, %xmm9 -; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 -; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm10 {%k5} -; AVX512BW-NEXT: vextracti64x4 $1, %zmm10, %ymm11 -; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm9 -; AVX512BW-NEXT: vmovdqa 288(%rdi), %ymm8 -; AVX512BW-NEXT: vpblendmw %ymm9, %ymm8, %ymm14 {%k2} -; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u],zero,zero,zero,xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,4,9,14],zero,zero,zero,xmm14[2,7,12],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm18 +; AVX512BW-NEXT: vmovdqu8 %ymm8, %ymm7 {%k4} +; AVX512BW-NEXT: vmovdqa 144(%rdi), %xmm11 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11] +; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm12 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512BW-NEXT: vpor %xmm8, 
%xmm10, %xmm8 +; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 +; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm9 {%k5} +; AVX512BW-NEXT: vextracti64x4 $1, %zmm9, %ymm10 +; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm8 +; AVX512BW-NEXT: vmovdqa 288(%rdi), %ymm7 +; AVX512BW-NEXT: vpblendmw %ymm8, %ymm7, %ymm13 {%k2} +; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero +; AVX512BW-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm20 ; AVX512BW-NEXT: movw $10570, %ax # imm = 0x294A ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm11 {%k3} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm11[2,3,0,1] +; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k3} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] ; AVX512BW-NEXT: movl $-2078212096, %eax # imm = 0x84210000 ; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm11 {%k6} -; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k2} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero,xmm14[u,u,u] -; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13,u,u,u] -; AVX512BW-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendmw %ymm5, %ymm4, %ymm11 {%k2} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm15 = ymm11[2,3,0,1] +; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} +; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u] +; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u] +; AVX512BW-NEXT: vpor %xmm14, %xmm13, %xmm14 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendmw %ymm6, %ymm5, %ymm10 {%k2} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] ; AVX512BW-NEXT: movl $8456, %eax # imm = 0x2108 ; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm11 {%k6} -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vporq %xmm15, %xmm16, %xmm15 -; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[2,7,12] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512BW-NEXT: vporq %xmm15, %xmm16, %xmm15 -; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm11 -; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm14 {%k5} -; 
AVX512BW-NEXT: vextracti64x4 $1, %zmm14, %ymm11 -; AVX512BW-NEXT: vpblendmw %ymm8, %ymm9, %ymm15 {%k1} -; AVX512BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u],zero,zero,zero,zero,xmm16[4,9,14],zero,zero,zero,xmm16[2,7,12] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,0,5,10,15],zero,zero,zero,xmm15[3,8,13],zero,zero,zero -; AVX512BW-NEXT: vporq %xmm16, %xmm15, %xmm15 -; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm10 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqa 176(%rdi), %xmm13 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vporq %xmm16, %xmm17, %xmm16 +; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512BW-NEXT: vporq %xmm16, %xmm17, %xmm16 +; AVX512BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 +; AVX512BW-NEXT: vmovdqu16 %zmm15, %zmm14 {%k5} +; AVX512BW-NEXT: vextracti64x4 $1, %zmm14, %ymm15 +; AVX512BW-NEXT: vpblendmw %ymm7, %ymm8, %ymm16 {%k1} +; AVX512BW-NEXT: vextracti32x4 $1, %ymm16, %xmm17 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,zero,zero,xmm17[4,9,14],zero,zero,zero,xmm17[2,7,12] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,0,5,10,15],zero,zero,zero,xmm16[3,8,13],zero,zero,zero +; AVX512BW-NEXT: vporq %xmm17, %xmm16, %xmm16 +; AVX512BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 ; AVX512BW-NEXT: movl $-524288, %eax # imm = 0xFFF80000 ; AVX512BW-NEXT: kmovd %eax, %k4 -; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4} -; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm19 -; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm14 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] +; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} +; AVX512BW-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 +; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm15 {%k1} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm16 = ymm15[2,3,0,1] ; AVX512BW-NEXT: movl $138543104, %eax # imm = 0x8420000 ; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm14 {%k6} -; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k3} -; AVX512BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[4,9,14,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[u,u,u] -; AVX512BW-NEXT: vporq %xmm16, %xmm15, %xmm15 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm16 = ymm14[2,3,0,1] +; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k6} +; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k3} +; AVX512BW-NEXT: vextracti32x4 $1, %ymm16, %xmm17 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[1,6,11],zero,zero,zero,zero,xmm17[4,9,14,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = 
xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15],zero,zero,zero,xmm16[u,u,u] +; AVX512BW-NEXT: vporq %xmm17, %xmm16, %xmm16 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendmw %ymm5, %ymm6, %ymm15 {%k1} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm17 = ymm15[2,3,0,1] ; AVX512BW-NEXT: movl $16912, %eax # imm = 0x4210 ; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm14 {%k6} -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[2,7,12],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vporq %xmm16, %xmm17, %xmm10 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3,4,5,6,7] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[3,8,13] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX512BW-NEXT: vporq %xmm14, %xmm16, %xmm14 -; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm10 -; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm15 {%k5} -; AVX512BW-NEXT: vextracti64x4 $1, %zmm15, %ymm10 -; AVX512BW-NEXT: vpblendmw %ymm8, %ymm9, %ymm14 {%k2} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm14[u,u,u,1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero -; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13] -; AVX512BW-NEXT: vporq %xmm16, %xmm14, %xmm14 -; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm10 {%k4} -; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm15, %zmm14 -; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k2} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1] +; AVX512BW-NEXT: vmovdqu8 %ymm17, %ymm15 {%k6} +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vporq %xmm17, %xmm18, %xmm4 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512BW-NEXT: vporq %xmm15, %xmm17, %xmm15 +; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm4 +; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm16 {%k5} +; AVX512BW-NEXT: vextracti64x4 $1, %zmm16, %ymm4 +; AVX512BW-NEXT: vpblendmw %ymm7, %ymm8, %ymm15 {%k2} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm15[u,u,u,1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero +; AVX512BW-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13] +; AVX512BW-NEXT: vporq %xmm17, %xmm15, %xmm15 +; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; 
AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm4 {%k4} +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm16, %zmm15 +; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k2} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm16 = ymm4[2,3,0,1] ; AVX512BW-NEXT: movl $277086208, %eax # imm = 0x10840000 ; AVX512BW-NEXT: kmovd %eax, %k5 -; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm10 {%k5} -; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k1} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u] -; AVX512BW-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15,u,u,u] -; AVX512BW-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm4 {%k5} +; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k1} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm16[3,8,13],zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[u,u,u] +; AVX512BW-NEXT: vextracti32x4 $1, %ymm16, %xmm16 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15,u,u,u] +; AVX512BW-NEXT: vporq %xmm17, %xmm16, %xmm16 ; AVX512BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000 ; AVX512BW-NEXT: kmovd %eax, %k5 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendmw %ymm4, %ymm5, %ymm10 {%k2} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm16 = ymm10[2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendmw %ymm5, %ymm6, %ymm4 {%k2} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm17 = ymm4[2,3,0,1] ; AVX512BW-NEXT: movl $33825, %eax # imm = 0x8421 ; AVX512BW-NEXT: kmovd %eax, %k5 -; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm10 {%k5} -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vporq %xmm16, %xmm17, %xmm11 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm12[4,9,14] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 +; AVX512BW-NEXT: vmovdqu8 %ymm17, %ymm4 {%k5} +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vporq %xmm17, %xmm18, %xmm9 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512BW-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, 
%ymm9 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 ; AVX512BW-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF ; AVX512BW-NEXT: kmovq %rax, %k5 -; AVX512BW-NEXT: vmovdqu8 %zmm15, %zmm10 {%k5} -; AVX512BW-NEXT: vextracti64x4 $1, %zmm10, %ymm11 -; AVX512BW-NEXT: vpblendmw %ymm9, %ymm8, %ymm12 {%k3} -; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,2,7,12],zero,zero,zero,xmm12[0,5,10,15],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-NEXT: vmovdqu8 %ymm12, %ymm11 {%k4} -; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 +; AVX512BW-NEXT: vmovdqu8 %zmm16, %zmm4 {%k5} +; AVX512BW-NEXT: vextracti64x4 $1, %zmm4, %ymm9 +; AVX512BW-NEXT: vpblendmw %ymm8, %ymm7, %ymm11 {%k3} +; AVX512BW-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero +; AVX512BW-NEXT: vpor %xmm12, %xmm11, %xmm11 +; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512BW-NEXT: vmovdqu8 %ymm11, %ymm9 {%k4} +; AVX512BW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 ; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2} ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] @@ -7400,36 +7401,35 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX512BW-NEXT: vmovdqu16 %ymm5, %ymm4 {%k3} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,3,0,1] +; AVX512BW-NEXT: vmovdqu16 %ymm6, %ymm5 {%k3} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm5[2,3,0,1] ; AVX512BW-NEXT: movl $2114, %eax # imm = 0x842 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm4 {%k2} -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm5 {%k2} +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,5,0,5,0,5,0,5] -; AVX512BW-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX512BW-NEXT: vpermd %ymm2, %ymm19, %ymm2 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5} ; 
AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512BW-NEXT: vmovdqu16 %ymm9, %ymm8 {%k1} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,3,8,13],zero,zero,zero,xmm8[1,6,11],zero,zero,zero,zero -; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm3 +; AVX512BW-NEXT: vmovdqu16 %ymm8, %ymm7 {%k1} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero +; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm3 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] ; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k4} ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm20, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -7453,163 +7453,164 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm10 +; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm9 ; AVX512BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512BW-FCP-NEXT: kmovd %eax, %k5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm5 -; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm4 -; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm6 {%k1} -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm6 +; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm5 +; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm4 {%k1} +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] ; AVX512BW-FCP-NEXT: movl $4228, %eax # imm = 0x1084 ; AVX512BW-FCP-NEXT: kmovd %eax, %k3 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm6 {%k3} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %xmm7 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] -; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm4 {%k3} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm19 = [0,5,0,5,0,5,0,5] +; AVX512BW-FCP-NEXT: vpermd %ymm8, %ymm19, %ymm8 ; AVX512BW-FCP-NEXT: 
movl $127, %eax ; AVX512BW-FCP-NEXT: kmovd %eax, %k4 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm8 {%k4} -; AVX512BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm12 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[1,6,11] -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm13 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm11, %xmm9 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm10 {%k5} -; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm10, %ymm11 -; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm9 -; AVX512BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm8 -; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm8, %ymm14 {%k2} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u],zero,zero,zero,xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,4,9,14],zero,zero,zero,xmm14[2,7,12],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm8, %ymm7 {%k4} +; AVX512BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm11 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11] +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm12 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512BW-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm9 {%k5} +; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm9, %ymm10 +; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm8 +; AVX512BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm7 +; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm13 {%k2} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero +; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm20 ; AVX512BW-FCP-NEXT: movw $10570, %ax # imm = 0x294A ; AVX512BW-FCP-NEXT: kmovd %eax, %k3 -; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm11 {%k3} -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm11[2,3,0,1] +; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k3} +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] ; AVX512BW-FCP-NEXT: movl $-2078212096, %eax # imm = 0x84210000 ; AVX512BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm11 {%k6} -; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k2} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero,xmm14[u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; 
AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm11 {%k2} -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm11[2,3,0,1] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} +; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm10 {%k2} +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] ; AVX512BW-FCP-NEXT: movl $8456, %eax # imm = 0x2108 ; AVX512BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm11 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[2,7,12] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm14 {%k5} -; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm14, %ymm11 -; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm9, %ymm15 {%k1} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u],zero,zero,zero,zero,xmm16[4,9,14],zero,zero,zero,xmm16[2,7,12] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,0,5,10,15],zero,zero,zero,xmm15[3,8,13],zero,zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %xmm10 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqa 176(%rdi), %xmm13 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm16 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm16 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm15, %zmm14 {%k5} +; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm14, %ymm15 +; AVX512BW-FCP-NEXT: 
vpblendmw %ymm7, %ymm8, %ymm16 {%k1} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,zero,zero,xmm17[4,9,14],zero,zero,zero,xmm17[2,7,12] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,0,5,10,15],zero,zero,zero,xmm16[3,8,13],zero,zero,zero +; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 ; AVX512BW-FCP-NEXT: movl $-524288, %eax # imm = 0xFFF80000 ; AVX512BW-FCP-NEXT: kmovd %eax, %k4 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm19 -; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm14 {%k1} -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 +; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm15 {%k1} +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm15[2,3,0,1] ; AVX512BW-FCP-NEXT: movl $138543104, %eax # imm = 0x8420000 ; AVX512BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k6} -; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k3} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[4,9,14,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k1} -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm14[2,3,0,1] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k6} +; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k3} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[1,6,11],zero,zero,zero,zero,xmm17[4,9,14,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15],zero,zero,zero,xmm16[u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm15 {%k1} +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm15[2,3,0,1] ; AVX512BW-FCP-NEXT: movl $16912, %eax # imm = 0x4210 ; AVX512BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm14 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[2,7,12],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm10 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3,4,5,6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[3,8,13] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm14, %xmm16, %xmm14 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, 
%zmm14, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm15 {%k5} -; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm15, %ymm10 -; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm9, %ymm14 {%k2} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm14[u,u,u,1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13] -; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm14, %xmm14 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm10 {%k4} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm15, %zmm14 -; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k2} -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm15 {%k6} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm4 +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm17, %xmm15 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm16 {%k5} +; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm16, %ymm4 +; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm8, %ymm15 {%k2} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm15[u,u,u,1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13] +; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm15, %xmm15 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm4 {%k4} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm16, %zmm15 +; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k2} +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm4[2,3,0,1] ; AVX512BW-FCP-NEXT: movl $277086208, %eax # imm = 0x10840000 ; AVX512BW-FCP-NEXT: kmovd %eax, %k5 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm10 {%k5} -; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k1} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm4 {%k5} +; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k1} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm16[3,8,13],zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[u,u,u] +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm17, 
%xmm16, %xmm16 ; AVX512BW-FCP-NEXT: movl $33546240, %eax # imm = 0x1FFE000 ; AVX512BW-FCP-NEXT: kmovd %eax, %k5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm10 {%k2} -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm10[2,3,0,1] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm4 {%k2} +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm4[2,3,0,1] ; AVX512BW-FCP-NEXT: movl $33825, %eax # imm = 0x8421 ; AVX512BW-FCP-NEXT: kmovd %eax, %k5 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm10 {%k5} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm11 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm12[4,9,14] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm4 {%k5} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm9 +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 ; AVX512BW-FCP-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF ; AVX512BW-FCP-NEXT: kmovq %rax, %k5 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm10 {%k5} -; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm10, %ymm11 -; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm8, %ymm12 {%k3} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,2,7,12],zero,zero,zero,xmm12[0,5,10,15],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm12, %ymm11 {%k4} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm16, %zmm4 {%k5} +; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm4, %ymm9 +; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm11 {%k3} +; 
AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero +; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm9 {%k4} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2} ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] @@ -7622,36 +7623,35 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm4 {%k3} -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,3,0,1] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm5 {%k3} +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm5[2,3,0,1] ; AVX512BW-FCP-NEXT: movl $2114, %eax # imm = 0x842 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm4 {%k2} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm5 {%k2} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,5,0,5,0,5,0,5] -; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm19, %ymm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5} ; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm9, %ymm8 {%k1} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,3,8,13],zero,zero,zero,xmm8[1,6,11],zero,zero,zero,zero -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm3 +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm8, %ymm7 {%k1} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm3 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] ; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm2, %ymm0 {%k4} ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, 
%zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -7675,163 +7675,164 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm10 +; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm9 ; AVX512DQ-BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512DQ-BW-NEXT: kmovd %eax, %k5 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm5 -; AVX512DQ-BW-NEXT: vmovdqa 224(%rdi), %ymm4 -; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm4, %ymm6 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm6 +; AVX512DQ-BW-NEXT: vmovdqa 224(%rdi), %ymm5 +; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm5, %ymm4 {%k1} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] ; AVX512DQ-BW-NEXT: movl $4228, %eax # imm = 0x1084 ; AVX512DQ-BW-NEXT: kmovd %eax, %k3 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k3} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %xmm7 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] -; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm7, %ymm4 {%k3} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm19 = [0,5,0,5,0,5,0,5] +; AVX512DQ-BW-NEXT: vpermd %ymm8, %ymm19, %ymm8 ; AVX512DQ-BW-NEXT: movl $127, %eax ; AVX512DQ-BW-NEXT: kmovd %eax, %k4 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm9, %ymm8 {%k4} -; AVX512DQ-BW-NEXT: vmovdqa 144(%rdi), %xmm12 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[1,6,11] -; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %xmm13 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm11, %xmm9 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm10 {%k5} -; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm10, %ymm11 -; AVX512DQ-BW-NEXT: vmovdqa 256(%rdi), %ymm9 -; AVX512DQ-BW-NEXT: vmovdqa 
288(%rdi), %ymm8 -; AVX512DQ-BW-NEXT: vpblendmw %ymm9, %ymm8, %ymm14 {%k2} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u],zero,zero,zero,xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,4,9,14],zero,zero,zero,xmm14[2,7,12],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm8, %ymm7 {%k4} +; AVX512DQ-BW-NEXT: vmovdqa 144(%rdi), %xmm11 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11] +; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %xmm12 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm7, %zmm9 {%k5} +; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm9, %ymm10 +; AVX512DQ-BW-NEXT: vmovdqa 256(%rdi), %ymm8 +; AVX512DQ-BW-NEXT: vmovdqa 288(%rdi), %ymm7 +; AVX512DQ-BW-NEXT: vpblendmw %ymm8, %ymm7, %ymm13 {%k2} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm20 ; AVX512DQ-BW-NEXT: movw $10570, %ax # imm = 0x294A ; AVX512DQ-BW-NEXT: kmovd %eax, %k3 -; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm11 {%k3} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm14 = ymm11[2,3,0,1] +; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k3} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] ; AVX512DQ-BW-NEXT: movl $-2078212096, %eax # imm = 0x84210000 ; AVX512DQ-BW-NEXT: kmovd %eax, %k6 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm14, %ymm11 {%k6} -; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k2} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero,xmm14[u,u,u] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm4, %ymm11 {%k2} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm15 = ymm11[2,3,0,1] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} +; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm13, %xmm14 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = 
ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm5, %ymm10 {%k2} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] ; AVX512DQ-BW-NEXT: movl $8456, %eax # imm = 0x2108 ; AVX512DQ-BW-NEXT: kmovd %eax, %k6 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm11 {%k6} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vporq %xmm15, %xmm16, %xmm15 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[2,7,12] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512DQ-BW-NEXT: vporq %xmm15, %xmm16, %xmm15 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm11, %zmm14 {%k5} -; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm14, %ymm11 -; AVX512DQ-BW-NEXT: vpblendmw %ymm8, %ymm9, %ymm15 {%k1} -; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u],zero,zero,zero,zero,xmm16[4,9,14],zero,zero,zero,xmm16[2,7,12] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,0,5,10,15],zero,zero,zero,xmm15[3,8,13],zero,zero,zero -; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm15, %xmm15 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %xmm10 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqa 176(%rdi), %xmm13 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm17, %xmm16 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm17, %xmm16 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm15, %zmm14 {%k5} +; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm14, %ymm15 +; AVX512DQ-BW-NEXT: vpblendmw %ymm7, %ymm8, %ymm16 {%k1} +; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm16, %xmm17 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,zero,zero,xmm17[4,9,14],zero,zero,zero,xmm17[2,7,12] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,0,5,10,15],zero,zero,zero,xmm16[3,8,13],zero,zero,zero +; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm16, %xmm16 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 ; AVX512DQ-BW-NEXT: movl $-524288, %eax # imm = 0xFFF80000 ; AVX512DQ-BW-NEXT: kmovd %eax, %k4 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4} -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm19 -; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm14 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, 
%ymm15 {%k4} +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 +; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm15 {%k1} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm16 = ymm15[2,3,0,1] ; AVX512DQ-BW-NEXT: movl $138543104, %eax # imm = 0x8420000 ; AVX512DQ-BW-NEXT: kmovd %eax, %k6 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm14 {%k6} -; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k3} -; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[4,9,14,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[u,u,u] -; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm15, %xmm15 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm16 = ymm14[2,3,0,1] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k6} +; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k3} +; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm16, %xmm17 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[1,6,11],zero,zero,zero,zero,xmm17[4,9,14,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15],zero,zero,zero,xmm16[u,u,u] +; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm16, %xmm16 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm6, %ymm15 {%k1} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm17 = ymm15[2,3,0,1] ; AVX512DQ-BW-NEXT: movl $16912, %eax # imm = 0x4210 ; AVX512DQ-BW-NEXT: kmovd %eax, %k6 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm14 {%k6} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[2,7,12],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm17, %xmm10 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[3,8,13] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX512DQ-BW-NEXT: vporq %xmm14, %xmm16, %xmm14 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm10, %zmm15 {%k5} -; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm15, %ymm10 -; AVX512DQ-BW-NEXT: vpblendmw %ymm8, %ymm9, %ymm14 {%k2} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm14[u,u,u,1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13] -; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm14, %xmm14 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm14, %ymm10 {%k4} -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm15, %zmm14 -; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k2} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm17, %ymm15 
{%k6} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm18, %xmm4 +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512DQ-BW-NEXT: vporq %xmm15, %xmm17, %xmm15 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm4, %zmm16 {%k5} +; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm16, %ymm4 +; AVX512DQ-BW-NEXT: vpblendmw %ymm7, %ymm8, %ymm15 {%k2} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm15[u,u,u,1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13] +; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm15, %xmm15 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm4 {%k4} +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm16, %zmm15 +; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k2} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm16 = ymm4[2,3,0,1] ; AVX512DQ-BW-NEXT: movl $277086208, %eax # imm = 0x10840000 ; AVX512DQ-BW-NEXT: kmovd %eax, %k5 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm10 {%k5} -; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k1} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15,u,u,u] -; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm4 {%k5} +; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k1} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm16[3,8,13],zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[u,u,u] +; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm16, %xmm16 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15,u,u,u] +; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm16, %xmm16 ; AVX512DQ-BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000 ; AVX512DQ-BW-NEXT: kmovd %eax, %k5 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendmw %ymm4, %ymm5, %ymm10 {%k2} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm16 = ymm10[2,3,0,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm6, %ymm4 {%k2} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm17 = ymm4[2,3,0,1] ; AVX512DQ-BW-NEXT: movl $33825, %eax # imm = 0x8421 ; AVX512DQ-BW-NEXT: kmovd %eax, %k5 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm10 {%k5} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb 
{{.*#+}} xmm16 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm17, %xmm11 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm12[4,9,14] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm17, %ymm4 {%k5} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm18, %xmm9 +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 ; AVX512DQ-BW-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF ; AVX512DQ-BW-NEXT: kmovq %rax, %k5 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm15, %zmm10 {%k5} -; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm10, %ymm11 -; AVX512DQ-BW-NEXT: vpblendmw %ymm9, %ymm8, %ymm12 {%k3} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,2,7,12],zero,zero,zero,xmm12[0,5,10,15],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm12, %ymm11 {%k4} -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm16, %zmm4 {%k5} +; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm4, %ymm9 +; AVX512DQ-BW-NEXT: vpblendmw %ymm8, %ymm7, %ymm11 {%k3} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %xmm12, %xmm11, %xmm11 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm11, %ymm9 {%k4} +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2} ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] @@ -7844,36 +7845,35 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu16 %ymm5, %ymm4 {%k3} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,3,0,1] +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm6, %ymm5 {%k3} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm5[2,3,0,1] ; AVX512DQ-BW-NEXT: movl $2114, %eax # imm = 0x842 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm4 {%k2} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm5 {%k2} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,5,0,5,0,5,0,5] -; AVX512DQ-BW-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX512DQ-BW-NEXT: vpermd %ymm2, %ymm19, %ymm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5} ; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqu16 %ymm9, %ymm8 {%k1} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,3,8,13],zero,zero,zero,xmm8[1,6,11],zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm8, %xmm3 +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm8, %ymm7 {%k1} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm3 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] ; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k4} ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq @@ -7897,163 +7897,164 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpor 
%xmm6, %xmm5, %xmm9 ; AVX512DQ-BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm5 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] ; AVX512DQ-BW-FCP-NEXT: movl $4228, %eax # imm = 0x1084 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm6 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm4 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm19 = [0,5,0,5,0,5,0,5] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm8, %ymm19, %ymm8 ; AVX512DQ-BW-FCP-NEXT: movl $127, %eax ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm8 {%k4} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[1,6,11] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm11, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm10 {%k5} -; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm10, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm8 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm8, %ymm14 {%k2} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u],zero,zero,zero,xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,4,9,14],zero,zero,zero,xmm14[2,7,12],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm8, %ymm7 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm11 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} 
xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm9 {%k5} +; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm9, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm13 {%k2} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm20 ; AVX512DQ-BW-FCP-NEXT: movw $10570, %ax # imm = 0x294A ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm11 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm11[2,3,0,1] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] ; AVX512DQ-BW-FCP-NEXT: movl $-2078212096, %eax # imm = 0x84210000 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm11 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero,xmm14[u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm11 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm11[2,3,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm10 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] ; AVX512DQ-BW-FCP-NEXT: movl $8456, %eax # imm = 0x2108 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm11 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: 
vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[2,7,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm14 {%k5} -; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm14, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm9, %ymm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u],zero,zero,zero,zero,xmm16[4,9,14],zero,zero,zero,xmm16[2,7,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,0,5,10,15],zero,zero,zero,xmm15[3,8,13],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 176(%rdi), %xmm13 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm15, %zmm14 {%k5} +; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm14, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm8, %ymm16 {%k1} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,zero,zero,xmm17[4,9,14],zero,zero,zero,xmm17[2,7,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,0,5,10,15],zero,zero,zero,xmm16[3,8,13],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 ; AVX512DQ-BW-FCP-NEXT: movl $-524288, %eax # imm = 0xFFF80000 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm15 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm15[2,3,0,1] ; AVX512DQ-BW-FCP-NEXT: movl $138543104, %eax # imm = 0x8420000 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, 
%ymm15 {%k3} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[4,9,14,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm14[2,3,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k3} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[1,6,11],zero,zero,zero,zero,xmm17[4,9,14,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15],zero,zero,zero,xmm16[u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm15 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm15[2,3,0,1] ; AVX512DQ-BW-FCP-NEXT: movl $16912, %eax # imm = 0x4210 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm14 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[2,7,12],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[3,8,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm14, %xmm16, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm15 {%k5} -; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm15, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm9, %ymm14 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm14[u,u,u,1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm10 {%k4} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm15, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm15 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = 
zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm17, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm16 {%k5} +; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm16, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm8, %ymm15 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm15[u,u,u,1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm15, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm4 {%k4} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm16, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm4[2,3,0,1] ; AVX512DQ-BW-FCP-NEXT: movl $277086208, %eax # imm = 0x10840000 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm10 {%k5} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm4 {%k5} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm16[3,8,13],zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16 ; AVX512DQ-BW-FCP-NEXT: movl $33546240, %eax # imm = 0x1FFE000 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm10 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm10[2,3,0,1] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm4 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm4[2,3,0,1] ; AVX512DQ-BW-FCP-NEXT: movl $33825, %eax # imm = 0x8421 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm10 {%k5} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; 
AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm12[4,9,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm4 {%k5} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 ; AVX512DQ-BW-FCP-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm10 {%k5} -; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm10, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm8, %ymm12 {%k3} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,2,7,12],zero,zero,zero,xmm12[0,5,10,15],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm12, %ymm11 {%k4} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm16, %zmm4 {%k5} +; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm4, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm11 {%k3} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm9 {%k4} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2} ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] @@ -8066,36 +8067,35 @@ define void 
@load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm4 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,3,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm5 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm5[2,3,0,1] ; AVX512DQ-BW-FCP-NEXT: movl $2114, %eax # imm = 0x842 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm4 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm5 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,5,0,5,0,5,0,5] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm19, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5} ; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm9, %ymm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,3,8,13],zero,zero,zero,xmm8[1,6,11],zero,zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm8, %ymm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm2, %ymm0 {%k4} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll 
b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll index 9ce685f13e476..f87126a98eea4 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll @@ -7354,12 +7354,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-LABEL: load_i8_stride6_vf64: ; AVX512: # %bb.0: ; AVX512-NEXT: subq $40, %rsp -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512-NEXT: vmovdqa64 224(%rdi), %ymm25 ; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm26 ; AVX512-NEXT: vmovdqa %ymm12, %ymm0 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm25 ^ (ymm0 & (ymm26 ^ ymm25)) +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] ; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm3 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm4 @@ -7608,12 +7608,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-LABEL: load_i8_stride6_vf64: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: subq $40, %rsp -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm25 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26 ; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm0 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm25 ^ (ymm0 & (ymm26 ^ ymm25)) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm3 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 @@ -7862,12 +7862,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-LABEL: load_i8_stride6_vf64: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: subq $40, %rsp -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512DQ-NEXT: vmovdqa64 224(%rdi), %ymm25 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm26 ; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm0 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm25 ^ (ymm0 & (ymm26 ^ ymm25)) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm3 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm4 @@ -8116,12 +8116,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-LABEL: load_i8_stride6_vf64: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: subq $40, %rsp -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm0 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm25 ^ (ymm0 & (ymm26 ^ ymm25)) +; 
AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm3
 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4
@@ -8370,12 +8370,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-LABEL: load_i8_stride6_vf64:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa 224(%rdi), %ymm0
 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm23
 ; AVX512BW-NEXT: movw $18724, %r10w # imm = 0x4924
 ; AVX512BW-NEXT: kmovd %r10d, %k1
 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm23, %ymm9 {%k1}
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm9, %xmm1
 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
 ; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm12
@@ -8606,12 +8606,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-LABEL: load_i8_stride6_vf64:
 ; AVX512BW-FCP: # %bb.0:
 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm0
 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm23
 ; AVX512BW-FCP-NEXT: movw $18724, %r10w # imm = 0x4924
 ; AVX512BW-FCP-NEXT: kmovd %r10d, %k1
 ; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm23, %ymm9 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm9, %xmm1
 ; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm12
@@ -8842,12 +8842,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-LABEL: load_i8_stride6_vf64:
 ; AVX512DQ-BW: # %bb.0:
 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
 ; AVX512DQ-BW-NEXT: vmovdqa 224(%rdi), %ymm0
 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %ymm23
 ; AVX512DQ-BW-NEXT: movw $18724, %r10w # imm = 0x4924
 ; AVX512DQ-BW-NEXT: kmovd %r10d, %k1
 ; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm23, %ymm9 {%k1}
+; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
 ; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm9, %xmm1
 ; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm9, %xmm12
@@ -9078,12 +9078,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-LABEL: load_i8_stride6_vf64:
 ; AVX512DQ-BW-FCP: # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm0
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm23
 ; AVX512DQ-BW-FCP-NEXT: movw $18724, %r10w # imm = 0x4924
 ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1
 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm23, %ymm9 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm9, %xmm1
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm12
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
index bea6219b9fbac..5ab09194c5b83 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
@@ -12121,414 +12121,399 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512-FCP-LABEL: load_i8_stride7_vf64:
 ; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
-; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm19
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm10
-; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm30
-; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm7
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm10 ^ (ymm1 & (ymm19 ^ ymm10))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm27
-; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm30 ^ (ymm2 & (ymm27 ^ ymm30))
-; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm9
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm9[2],ymm2[3,4],ymm9[5],ymm2[6,7,8,9],ymm9[10],ymm2[11,12],ymm9[13],ymm2[14,15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 | (ymm1 & mem)
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm31
-; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm29
-; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm1
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm31 ^ (ymm1 & (ymm29 ^ ymm31))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
+; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm20
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm12
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm27
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm20 ^ ymm12))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm31
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm1
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm27 ^ (ymm1 & (ymm31 ^ ymm27))
+; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm6
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7,8,9],ymm6[10],ymm1[11,12],ymm6[13],ymm1[14,15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & mem)
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm28
+; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm30
+; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm28 ^ (ymm0 & (ymm30 ^ ymm28))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,2,4,6]
-; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm18
-; AVX512-FCP-NEXT: vpermd %ymm18, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm3
+; AVX512-FCP-NEXT: vpermd %ymm3, %ymm2, %ymm2
 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-FCP-NEXT: vmovdqa 240(%rdi), %xmm8
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %xmm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm2, %xmm5, %xmm2
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm20
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm2 & (zmm20 ^ zmm4))
-; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm11
-; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %ymm26
-; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm4
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm11 ^ (ymm4 & (ymm26 ^ ymm11))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u]
-; AVX512-FCP-NEXT: vpor %xmm5, %xmm4, %xmm13
-; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm15
-; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm16
-; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm5
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm15 ^ (ymm5 & (ymm16 ^ ymm15))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm5[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm14[2],ymm5[3,4,5],ymm14[6],ymm5[7,8,9],ymm14[10],ymm5[11,12,13],ymm14[14],ymm5[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm21 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm13 & ymm21)
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
-; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm13
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm10 ^ (ymm13 & (ymm19 ^ ymm10))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm3, %xmm13, %xmm3
-; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm1
-; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm13
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm27 ^ (ymm13 & (ymm30 ^ ymm27))
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm9[2],ymm13[3,4,5],ymm9[6],ymm13[7,8,9],ymm9[10],ymm13[11,12,13],ymm9[14],ymm13[15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-FCP-NEXT: vmovdqa 240(%rdi), %xmm4
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %xmm5
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm2, %xmm7, %xmm2
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm8
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm21 & (zmm8 ^ zmm1))
+; AVX512-FCP-NEXT: vmovdqa64 288(%rdi), %ymm16
+; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm11
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm16 ^ (ymm0 & (ymm11 ^ ymm16))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u]
+; AVX512-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm14
+; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm2
+; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm14 ^ (ymm7 & (ymm2 ^ ymm14))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm7[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm13[2],ymm7[3,4,5],ymm13[6],ymm7[7,8,9],ymm13[10],ymm7[11,12,13],ymm13[14],ymm7[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm26 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm0 & ymm26)
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
+; AVX512-FCP-NEXT: vmovdqa64 416(%rdi), %ymm17
+; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %ymm18
+; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm18 ^ ymm17))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm15
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm7[4,11],zero,zero
+; AVX512-FCP-NEXT: vpor %xmm7, %xmm15, %xmm7
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm15
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm23 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 ^ (ymm23 & (ymm15 ^ ymm13))
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} zmm29 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm29 & (zmm0 ^ zmm8))
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm8
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm12 ^ (ymm8 & (ymm20 ^ ymm12))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm13
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm13, %xmm8, %xmm8
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm13
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm31 ^ (ymm13 & (ymm27 ^ ymm31))
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm6[2],ymm13[3,4,5],ymm6[6],ymm13[7,8,9],ymm6[10],ymm13[11,12,13],ymm6[14],ymm13[15]
 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm3 & ~mem)
-; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm31 ^ (ymm3 & (ymm29 ^ ymm31))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,3,4,6]
-; AVX512-FCP-NEXT: vpermd %ymm18, %ymm7, %ymm7
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm23
-; AVX512-FCP-NEXT: vmovdqa %xmm0, %xmm4
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm8 & ~mem)
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm8
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm28 ^ (ymm8 & (ymm30 ^ ymm28))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,6,13],zero,zero,xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[4,11],zero,zero,xmm8[0,7,14,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm15, %xmm8, %xmm8
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,4,6]
+; AVX512-FCP-NEXT: vpermd %ymm3, %ymm15, %ymm15
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm15[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm7, %xmm15, %xmm7
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm8, %zmm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm21 & (zmm7 ^ zmm13))
+; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm8
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm11 ^ (ymm8 & (ymm16 ^ ymm11))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm13
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u]
+; AVX512-FCP-NEXT: vpor %xmm13, %xmm8, %xmm8
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm13
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm14 ^ (ymm13 & (ymm2 ^ ymm14))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8,9,10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm8 & ymm26)
+; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm8
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm17 ^ (ymm8 & (ymm18 ^ ymm17))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10]
+; AVX512-FCP-NEXT: vpor %xmm15, %xmm8, %xmm8
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm23 & (ymm8 ^ ymm13))
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm24
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm29 & (zmm24 ^ zmm7))
+; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm20 ^ ymm12))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm22
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm2 & (zmm22 ^ zmm13))
-; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm10 ^ (ymm2 & (ymm19 ^ ymm10))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm27 ^ (ymm3 & (ymm30 ^ ymm27))
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3],ymm3[4,5],ymm9[6],ymm3[7,8,9,10],ymm9[11],ymm3[12,13],ymm9[14],ymm3[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm17)
-; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm29 ^ (ymm2 & (ymm31 ^ ymm29))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,3,5,6]
-; AVX512-FCP-NEXT: vpermd %ymm18, %ymm7, %ymm7
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm7[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm1
-; AVX512-FCP-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm8
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31))
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7,8,9,10],ymm6[11],ymm8[12,13],ymm6[14],ymm8[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm25 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm25)
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm30 ^ (ymm7 & (ymm28 ^ ymm30))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm13
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[1,8,15,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,3,5,6]
+; AVX512-FCP-NEXT: vpermd %ymm3, %ymm13, %ymm3
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,6],ymm3[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm3
+; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm22 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm8 ^ (zmm22 & (zmm3 ^ zmm8))
+; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm11 ^ (ymm7 & (ymm16 ^ ymm11))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u]
 ; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm2, %zmm23
-; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm18 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = zmm3 ^ (zmm18 & (zmm23 ^ zmm3))
-; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm10 ^ (ymm2 & (ymm19 ^ ymm10))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm27 ^ (ymm3 & (ymm30 ^ ymm27))
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm3[1,2],ymm9[3],ymm3[4,5,6],ymm9[7,8],ymm3[9,10],ymm9[11],ymm3[12,13,14],ymm9[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm17)
-; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm29 ^ (ymm2 & (ymm31 ^ ymm29))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm7
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm0
-; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm8
-; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm0
-; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512-FCP-NEXT: vpor %xmm8, %xmm13, %xmm8
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (mem & (ymm8 ^ ymm7))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm1[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm8
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm2 ^ (ymm8 & (ymm14 ^ ymm2))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm8[1,2],ymm13[3],ymm8[4,5,6],ymm13[7,8],ymm8[9,10],ymm13[11],ymm8[12,13,14],ymm13[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm26)
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm18 ^ ymm17))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11]
 ; AVX512-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7
-; AVX512-FCP-NEXT: vmovdqa64 416(%rdi), %ymm24
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm8, %zmm28
-; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %ymm25
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm28 = zmm3 ^ (zmm18 & (zmm28 ^ zmm3))
-; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero
-; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm0 & (ymm3 ^ ymm5))
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm18
-; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} zmm5 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm5 & (zmm18 ^ zmm20))
-; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm26 ^ (ymm3 & (ymm11 ^ ymm26))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u]
-; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm7
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm15 ^ (ymm7 & (ymm16 ^ ymm15))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7,8,9,10],ymm8[11],ymm7[12,13],ymm8[14],ymm7[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm3 & ymm21)
-; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10]
-; AVX512-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm0 & (ymm3 ^ ymm7))
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm20
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm5 & (zmm20 ^ zmm22))
-; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm26 ^ (ymm3 & (ymm11 ^ ymm26))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[6,13,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u]
-; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm7
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm16 ^ (ymm7 & (ymm15 ^ ymm16))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2],ymm8[3],ymm7[4,5,6],ymm8[7,8],ymm7[9,10],ymm8[11],ymm7[12,13,14],ymm8[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm3 & ymm21)
-; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11]
-; AVX512-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm0 & (ymm3 ^ ymm7))
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm22
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm5 & (zmm22 ^ zmm23))
-; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm15 ^ ymm16))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3],ymm7[4],ymm3[5,6],ymm7[7,8],ymm3[9,10,11],ymm7[12],ymm3[13,14],ymm7[15]
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm26 ^ (ymm3 & (ymm11 ^ ymm26))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm23 & (ymm7 ^ ymm8))
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm21
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm29 & (zmm21 ^ zmm3))
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm12 ^ (ymm3 & (ymm20 ^ ymm12))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm21) | ymm2
-; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero
-; AVX512-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm0 & (ymm2 ^ ymm3))
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm23
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm5 & (zmm23 ^ zmm28))
-; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm16 ^ (ymm2 & (ymm15 ^ ymm16))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
+; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm31 ^ (ymm7 & (ymm27 ^ ymm31))
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4,5,6],ymm6[7,8],ymm7[9,10],ymm6[11],ymm7[12,13,14],ymm6[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm3 & ymm25)
 ; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm11 ^ (ymm3 & (ymm26 ^ ymm11))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u]
-; AVX512-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm21) | ymm2
-; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm5
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero
-; AVX512-FCP-NEXT: vpor %xmm5, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm0 & (ymm4 ^ ymm3))
-; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm2
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm21
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm15 ^ (ymm2 & (ymm16 ^ ymm15))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15]
-; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm11 ^ (ymm3 & (ymm26 ^ ymm11))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm30 ^ (ymm3 & (ymm28 ^ ymm30))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm8
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm3, %xmm8, %xmm8
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm0
+; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm15
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm3
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm3))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm7 ^ (zmm22 & (zmm0 ^ zmm7))
+; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm14 ^ ymm2))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3],ymm4[4],ymm3[5,6],ymm4[7,8],ymm3[9,10,11],ymm4[12],ymm3[13,14],ymm4[15]
+; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm3
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm4
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm11 ^ (ymm4 & (ymm16 ^ ymm11))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,6,13],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u]
+; AVX512-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & ymm26) | ymm3
+; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm17 ^ ymm18))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero
+; AVX512-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm23 & (ymm3 ^ ymm4))
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm22
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm29 & (zmm22 ^ zmm0))
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm14 ^ ymm2))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7,8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14,15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm11 ^ ymm16))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u]
+; AVX512-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm26) | ymm0
+; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm18 ^ (ymm0 & (ymm17 ^ ymm18))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero
+; AVX512-FCP-NEXT: vpor %xmm4, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm29
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm29 = ymm29 ^ (ymm23 & (ymm29 ^ ymm3))
 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ~mem) | ymm2
-; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14]
-; AVX512-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm28
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm28 = ymm28 ^ (ymm0 & (ymm28 ^ ymm3))
-; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm19 ^ (ymm2 & (ymm10 ^ ymm19))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm21 = ymm24 ^ (ymm21 & (ymm25 ^ ymm24))
-; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm19 ^ (ymm3 & (ymm10 ^ ymm19))
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm2 ^ ymm14))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7,8],ymm3[9],ymm0[10,11,12],ymm3[13],ymm0[14,15]
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm11 ^ ymm16))
 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[3,10],zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u]
 ; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm7
-; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm13
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm19 ^ (ymm6 & (ymm10 ^ ymm19))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm27 ^ (ymm7 & (ymm30 ^ ymm27))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm6, %xmm8, %xmm8
-; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm10
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm26 = ymm11 ^ (ymm12 & (ymm26 ^ ymm11))
-; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm6
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm30 ^ (ymm6 & (ymm27 ^ ymm30))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm27 ^ ymm30))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ~mem) | ymm0
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm18 ^ (ymm0 & (ymm17 ^ ymm18))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14]
+; AVX512-FCP-NEXT: vpor %xmm7, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm26
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm26 = ymm26 ^ (ymm23 & (ymm26 ^ ymm3))
+; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm13
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm20 ^ (ymm13 & (ymm12 ^ ymm20))
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm19 = ymm17 ^ (ymm19 & (ymm18 ^ ymm17))
 ; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm0
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1,2,3],ymm9[4],ymm7[5,6],ymm9[7,8],ymm7[9,10,11],ymm9[12],ymm7[13,14],ymm9[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm2 & ymm17)
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm0[1],ymm6[2,3],ymm0[4],ymm6[5,6,7,8],ymm0[9],ymm6[10,11],ymm0[12],ymm6[13,14,15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm3 & ymm17)
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7,8],ymm0[9],ymm12[10,11,12],ymm0[13],ymm12[14,15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm8 & ymm17)
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm29 ^ (ymm10 & (ymm31 ^ ymm29))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm10[u,u,2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm20 ^ (ymm0 & (ymm12 ^ ymm20))
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm8
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm20 ^ (ymm10 & (ymm12 ^ ymm20))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm3
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm3, %xmm12, %xmm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm31 ^ (ymm7 & (ymm27 ^ ymm31))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm12
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm0, %xmm12, %xmm0
+; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm12
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm16 ^ (ymm9 & (ymm11 ^ ymm16))
+; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm13
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm27 ^ (ymm13 & (ymm31 ^ ymm27))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm27 ^ (ymm9 & (ymm31 ^ ymm27))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[4,11],zero,zero,xmm10[0,7,14,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm4, %xmm10, %xmm4
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6],ymm6[7,8],ymm7[9,10,11],ymm6[12],ymm7[13,14],ymm6[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm3 & ymm25)
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm6[1],ymm13[2,3],ymm6[4],ymm13[5,6,7,8],ymm6[9],ymm13[10,11],ymm6[12],ymm13[13,14,15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm0 & ymm25)
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7,8],ymm6[9],ymm9[10,11,12],ymm6[13],ymm9[14,15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm4 & ymm25)
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm14 ^ (ymm1 & (ymm2 ^ ymm14))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm28 ^ ymm30))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm30 ^ (ymm8 & (ymm28 ^ ymm30))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm28 ^ (ymm1 & (ymm30 ^ ymm28))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm3
 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm3
+; AVX512-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0
 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm3
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm11 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm11 & (ymm3 ^ ymm2))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm16 = ymm15 ^ (ymm14 & (ymm16 ^ ymm15))
-; AVX512-FCP-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm29 ^ (ymm13 & (ymm31 ^ ymm29))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm31 ^ (ymm14 & (ymm29 ^ ymm31))
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm2
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm15[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm3 & (zmm2 ^ zmm9))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm10
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[1,8,15],zero,zero,xmm10[4,11,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm5
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512-FCP-NEXT: vpor %xmm5, %xmm10, %xmm5
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm9 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm9 & (ymm3 ^ ymm0))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm4
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm0, %xmm4, %xmm0
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm12, %xmm4
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm9 & (ymm4 ^ ymm0))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[5,12,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm5
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm15[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512-FCP-NEXT: vpor %xmm5, %xmm8, %xmm5
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (ymm11 & (ymm5 ^ ymm9))
-; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm13
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm9
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm11
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 ^ (ymm13 & (ymm11 ^ ymm9))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm5, %zmm5
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (ymm9 & (ymm5 ^ ymm1))
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,2,4,6,0,0,0,0]
+; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm8
+; AVX512-FCP-NEXT: vpermd %ymm8, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm1
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm3 & (zmm1 ^ zmm10))
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,3,4,6,0,0,0,0]
+; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm9
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm4, %zmm4
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm3 & (zmm4 ^ zmm7))
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,5,6,0,0,0,0]
+; AVX512-FCP-NEXT: vpermd %ymm8, %ymm7, %ymm7
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm5, %zmm5
 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm3 & (zmm5 ^ zmm6))
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm15, %xmm6
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm11, %zmm6
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm3 & (zmm6 ^ zmm7))
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm3
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm3
 ; AVX512-FCP-NEXT: movw $-512, %ax # imm = 0xFE00
 ; AVX512-FCP-NEXT: kmovw %eax, %k1
-; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm2 {%k1}
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm3
-; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm5 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm0
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm16[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15]
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u]
-; AVX512-FCP-NEXT: vextracti32x4 $1, %ymm26, %xmm1
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm4, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & mem) | ymm3
-; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm0
-; AVX512-FCP-NEXT: vextracti32x4 $1, %ymm21, %xmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero
-; AVX512-FCP-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1}
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm3
+; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm4 {%k1}
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm11[u,u,u,u,2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm3
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & mem) | ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm3
+; AVX512-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero
+; AVX512-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0
 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm22, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm23, (%r8)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%r9)
+; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1}
+; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovaps %zmm0, (%rsi)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%rdx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm22, (%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%r9)
 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rax)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
 ;
@@ -12961,413 +12946,405 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512DQ-FCP-LABEL: load_i8_stride7_vf64:
 ; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
-; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm26
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm11
-; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm30
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm11 ^ (ymm1 & (ymm26 ^ ymm11))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: pushq %rax
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm11
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm12
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm27
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm11 ^ ymm12))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm1
 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm29
+; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm31
 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm30 ^ (ymm2 & (ymm29 ^ ymm30))
-; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm9
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm9[2],ymm2[3,4],ymm9[5],ymm2[6,7,8,9],ymm9[10],ymm2[11,12],ymm9[13],ymm2[14,15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 | (ymm1 & mem)
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm24
-; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm31
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm1
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm24 ^ (ymm1 & (ymm31 ^ ymm24))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm27 ^ (ymm2 & (ymm31 ^ ymm27))
+; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm3
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm20
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm1 & mem)
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
+; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm28
+; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm30
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm1
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm28 ^ (ymm1 & (ymm30 ^ ymm28))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u]
 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm1, %xmm1
 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,2,4,6]
-; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm18
-; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-FCP-NEXT: vmovdqa 240(%rdi), %xmm8
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %xmm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm5, %xmm2
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm20
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm2 & (zmm20 ^ zmm4))
-; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm10
-; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %ymm19
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm4
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm10 ^ (ymm4 & (ymm19 ^ ymm10))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm4, %xmm13
-; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm15
-; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm16
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm5
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm15 ^ (ymm5 & (ymm16 ^ ymm15))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm5[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm14[2],ymm5[3,4,5],ymm14[6],ymm5[7,8,9],ymm14[10],ymm5[11,12,13],ymm14[14],ymm5[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm23 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm13 & ymm23)
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm13
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm11 ^ (ymm13 & (ymm26 ^ ymm11))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm13, %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm13
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm29 ^ (ymm13 & (ymm30 ^ ymm29))
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm9[2],ymm13[3,4,5],ymm9[6],ymm13[7,8,9],ymm9[10],ymm13[11,12,13],ymm9[14],ymm13[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm3 & ~mem)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm31 ^ ymm24))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,3,4,6]
-; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm7, %ymm7
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm22
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, %xmm4
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm21
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm2 & (zmm21 ^ zmm13))
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm26 ^ ymm11))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm29 ^ (ymm3 & (ymm30 ^ ymm29))
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3],ymm3[4,5],ymm9[6],ymm3[7,8,9,10],ymm9[11],ymm3[12,13],ymm9[14],ymm3[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm17)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm31 ^ (ymm2 & (ymm24 ^ ymm31))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u]
-; AVX512DQ-FCP-NEXT:
+; AVX512DQ-FCP-NEXT:
vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,3,5,6] -; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm7, %ymm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm7[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,2,4,6] +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm22 +; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX512DQ-FCP-NEXT: vmovdqa 240(%rdi), %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm2, %zmm22 -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm18 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm3 ^ (zmm18 & (zmm22 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm1, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm0 & (zmm7 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vmovdqa64 288(%rdi), %ymm16 +; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm26 ^ ymm11)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm16 ^ (ymm2 & (ymm1 ^ ymm16)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm29 ^ (ymm3 & (ymm30 ^ ymm29)) -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm3[1,2],ymm9[3],ymm3[4,5,6],ymm9[7,8],ymm3[9,10],ymm9[11],ymm3[12,13,14],ymm9[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} 
ymm3 = ymm3 | (ymm2 & ymm17) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm31 ^ (ymm2 & (ymm24 ^ ymm31)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm8, %xmm13, %xmm8 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm10, %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm13 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm14 ^ (ymm13 & (ymm2 ^ ymm14)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm13[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4,5],ymm8[6],ymm13[7,8,9],ymm8[10],ymm13[11,12,13],ymm8[14],ymm13[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm26 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm10 & ymm26) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vmovdqa64 416(%rdi), %ymm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %ymm18 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm13 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm17 ^ (ymm13 & (ymm18 ^ ymm17)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm21 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm21 & (ymm13 ^ ymm8)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} zmm29 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm29 & (zmm3 ^ zmm7)) +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & 
(ymm11 ^ ymm12)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,8,15],zero,zero,xmm7[4,11],zero,zero,xmm7[u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm3[2],ymm8[3,4,5],ymm3[6],ymm8[7,8,9],ymm3[10],ymm8[11,12,13],ymm3[14],ymm8[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ~mem) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm28 ^ (ymm7 & (ymm30 ^ ymm28)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,6,13],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[u,u,u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u],zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,3,4,6] +; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm13, %ymm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm7, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm0 & (zmm7 ^ zmm8)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm16 ^ ymm1)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm8, %xmm8 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm13 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm14 ^ (ymm13 & (ymm2 ^ ymm14)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8,9,10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm8 & ymm26) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm17 ^ (ymm8 & (ymm18 ^ ymm17)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10] +; AVX512DQ-FCP-NEXT: vpor %xmm15, %xmm8, %xmm8 ; 
AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (mem & (ymm8 ^ ymm7)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm1[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm21 & (ymm8 ^ ymm13)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm24 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm29 & (zmm24 ^ zmm7)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm11 ^ ymm12)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31)) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7,8,9,10],ymm3[11],ymm8[12,13],ymm3[14],ymm8[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm23 = [18446744073709551615,255,18446744073709486080,18446744073709551615] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm7 & ymm23) +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm30 ^ (ymm7 & (ymm28 ^ ymm30)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,1,3,5,6] +; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm8, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,6],ymm3[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm22 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm13 ^ (zmm22 & (zmm3 ^ zmm13)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm1 ^ (ymm7 & (ymm16 ^ ymm1)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[6,13,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u] ; 
AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 416(%rdi), %ymm24 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm8, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %ymm25 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm28 = zmm3 ^ (zmm18 & (zmm28 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm27 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm27 & (ymm3 ^ ymm5)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm18 -; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} zmm5 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm5 & (zmm18 ^ zmm20)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm19 ^ (ymm3 & (ymm10 ^ ymm19)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm13 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm2 ^ (ymm13 & (ymm14 ^ ymm2)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3],ymm13[4,5,6],ymm15[7,8],ymm13[9,10],ymm15[11],ymm13[12,13,14],ymm15[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm7 & ymm26) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm15 ^ (ymm7 & (ymm16 ^ ymm15)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7,8,9,10],ymm8[11],ymm7[12,13],ymm8[14],ymm7[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm3 & ymm23) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm18 ^ ymm17)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11] +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm15, %xmm7 +; 
AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm21 & (ymm7 ^ ymm13)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm20 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm29 & (zmm20 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm12 ^ (ymm3 & (ymm11 ^ ymm12)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm27 & (ymm3 ^ ymm7)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm20 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm5 & (zmm20 ^ zmm21)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm19 ^ (ymm3 & (ymm10 ^ ymm19)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[6,13,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm16 ^ (ymm7 & (ymm15 ^ ymm16)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2],ymm8[3],ymm7[4,5,6],ymm8[7,8],ymm7[9,10],ymm8[11],ymm7[12,13,14],ymm8[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm31 ^ (ymm7 & (ymm27 ^ ymm31)) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm7[1,2],ymm0[3],ymm7[4,5,6],ymm0[7,8],ymm7[9,10],ymm0[11],ymm7[12,13,14],ymm0[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm3 & ymm23) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm30 ^ (ymm3 & (ymm28 ^ ymm30)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm13, %xmm13 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm8 +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm8 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm27 & (ymm3 ^ ymm7)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm21 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm5 & (zmm21 ^ zmm22)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm15 ^ ymm16)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3],ymm7[4],ymm3[5,6],ymm7[7,8],ymm3[9,10,11],ymm7[12],ymm3[13,14],ymm7[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm19 ^ (ymm3 & (ymm10 ^ ymm19)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm23) | ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm25 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm25 & (ymm3 ^ ymm8)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm7 ^ (zmm22 & (zmm3 ^ zmm7)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm2 ^ (ymm4 & (ymm14 ^ ymm2)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3],ymm5[4],ymm4[5,6],ymm5[7,8],ymm4[9,10,11],ymm5[12],ymm4[13,14],ymm5[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm1 ^ (ymm5 & (ymm16 ^ ymm1)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm26) | ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm18 ^ (ymm4 & (ymm17 ^ ymm18)) +; 
AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm7 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm27 & (ymm2 ^ ymm3)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm22 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm5 & (zmm22 ^ zmm28)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm16 ^ (ymm2 & (ymm15 ^ ymm16)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm10 ^ (ymm3 & (ymm19 ^ ymm10)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm21 & (ymm4 ^ ymm5)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm22 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm29 & (zmm22 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm14 ^ ymm2)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm16 ^ (ymm4 & (ymm1 ^ ymm16)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm23) | ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm27 & (ymm1 ^ ymm3)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm15 ^ (ymm2 
& (ymm16 ^ ymm15)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm10 ^ (ymm3 & (ymm19 ^ ymm10)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & ymm26) | ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm17 ^ ymm18)) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm26 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm26 = ymm26 ^ (ymm21 & (ymm26 ^ ymm4)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm14 ^ (ymm3 & (ymm2 ^ ymm14)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ~mem) | ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm28 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm28 = ymm28 ^ (ymm27 & (ymm28 ^ ymm3)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm26 ^ (ymm2 & (ymm11 ^ ymm26)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vporq %xmm3, %xmm2, %xmm27 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm23 = ymm24 ^ (ymm23 & (ymm25 ^ ymm24)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm26 ^ (ymm2 & (ymm11 ^ ymm26)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, 
%ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm16 ^ (ymm4 & (ymm1 ^ ymm16)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm4, %xmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = (ymm7 & ~mem) | ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm13 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm11 ^ ymm26)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm29 ^ (ymm3 & (ymm30 ^ ymm29)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm8 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm19 = ymm10 ^ (ymm12 & (ymm19 ^ ymm10)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm30 ^ (ymm7 & (ymm29 ^ ymm30)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm29 ^ ymm30)) -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm3[1,2,3],ymm9[4],ymm3[5,6],ymm9[7,8],ymm3[9,10,11],ymm9[12],ymm3[13,14],ymm9[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm27 & ymm17) -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6,7,8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm2 & ymm17) -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7,8],ymm0[9],ymm12[10,11,12],ymm0[13],ymm12[14,15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm6 & ymm17) -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm0 ^ ymm31)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm6, %xmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm17 ^ ymm18)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14] +; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm3, 
%xmm3 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm29 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm29 = ymm29 ^ (ymm21 & (ymm29 ^ ymm7)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm13 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm11 ^ (ymm13 & (ymm12 ^ ymm11)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm19 = ymm17 ^ (ymm19 & (ymm18 ^ ymm17)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm11 ^ (ymm3 & (ymm12 ^ ymm11)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm11 ^ (ymm6 & (ymm12 ^ ymm11)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm11 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm12, %xmm13 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm11 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm11, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm12 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm16 ^ (ymm9 & (ymm1 ^ ymm16)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm15 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm27 ^ (ymm15 & (ymm31 ^ ymm27)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm27 ^ (ymm9 & (ymm31 ^ ymm27)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vporq %xmm11, %xmm0, %xmm16 +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm8[1,2,3],ymm4[4],ymm8[5,6],ymm4[7,8],ymm8[9,10,11],ymm4[12],ymm8[13,14],ymm4[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm13 & ymm23) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5,6,7,8],ymm4[9],ymm15[10,11],ymm4[12],ymm15[13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm3 & ymm23) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm4[1],ymm9[2,3,4],ymm4[5],ymm9[6,7,8],ymm4[9],ymm9[10,11,12],ymm4[13],ymm9[14,15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm16 & ymm23) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm14 ^ (ymm10 & (ymm2 ^ ymm14)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm28 ^ ymm30)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm30 ^ 
(ymm7 & (ymm28 ^ ymm30)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm28 ^ (ymm10 & (ymm30 ^ ymm28)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm9 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,xmm9[0,7,14],zero,zero,xmm9[3,10,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm9, %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8 -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm11 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm11 & (ymm8 ^ ymm2)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm16 = ymm15 ^ (ymm14 & (ymm16 ^ ymm15)) -; AVX512DQ-FCP-NEXT: vmovd {{.*#+}} xmm6 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm31 ^ (ymm13 & (ymm0 ^ ymm31)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm0 ^ (ymm14 & (ymm31 ^ ymm0)) -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm15[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm8 & (zmm2 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm10 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[1,8,15],zero,zero,xmm10[4,11,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm10, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm9 +; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm12, %xmm9 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm25 & (ymm9 ^ ymm3)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm7, %xmm3 ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = 
xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm10, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm5, %xmm5 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (ymm11 & (ymm5 ^ ymm3)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm13 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm3 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm25 & (ymm7 ^ ymm3)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm3 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm10, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm11 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,u,4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm10 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm12, %xmm10, %xmm10 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 ^ (ymm13 & (ymm11 ^ ymm3)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm8 & (zmm3 ^ zmm9)) -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm15, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm11, %zmm5 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm8 & (zmm5 ^ zmm7)) +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 ^ (ymm25 & (ymm10 ^ ymm3)) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,2,4,6,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm12 +; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm9, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; 
AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm9 & (zmm3 ^ zmm11)) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,3,4,6,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm11, %ymm11 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm7, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm9 & (zmm7 ^ zmm8)) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,3,5,6,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm8, %ymm8 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm10, %zmm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm9 & (zmm8 ^ zmm0)) ; AVX512DQ-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm28, %zmm0, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm16[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7,8,9],ymm6[10],ymm0[11,12],ymm6[13],ymm0[14,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u] -; AVX512DQ-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm1 +; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm26, %zmm0, %zmm3 {%k1} +; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm29, %zmm0, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & mem) | ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512DQ-FCP-NEXT: vextracti32x4 $1, %ymm23, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & mem) | ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512DQ-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%rcx) 
+; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQ-FCP-NEXT: popq %rax ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -13743,29 +13720,29 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-FCP-LABEL: load_i8_stride7_vf64: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] -; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm16 +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm24 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm24 +; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm13 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] -; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm17 +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm25 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm25 +; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm12 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] -; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm18 +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm16 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm10 +; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] -; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] ; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm12 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm10 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512BW-FCP-NEXT: movw $-28382, %ax # imm = 0x9122 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm1 {%k1} +; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm1 {%k1} ; AVX512BW-FCP-NEXT: kmovq %k1, %k2 ; AVX512BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6 @@ -13776,11 +13753,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 +; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm9 ; AVX512BW-FCP-NEXT: 
movw $8772, %ax # imm = 0x2244 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vpblendmw %ymm13, %ymm11, %ymm5 {%k1} +; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm5 {%k1} ; AVX512BW-FCP-NEXT: kmovq %k1, %k3 ; AVX512BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 @@ -13789,285 +13766,285 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,1,2,4,6] -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm19 -; AVX512BW-FCP-NEXT: vpermd %ymm19, %ymm6, %ymm6 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17 +; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm6, %ymm6 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX512BW-FCP-NEXT: vmovdqa 240(%rdi), %xmm6 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm6[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %xmm8 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm5, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 240(%rdi), %xmm19 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqa64 224(%rdi), %xmm20 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm5 ; AVX512BW-FCP-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 ; AVX512BW-FCP-NEXT: kmovq %rax, %k5 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm1 {%k5} -; AVX512BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm7 +; AVX512BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm6 ; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm5 ; AVX512BW-FCP-NEXT: movw $9288, %ax # imm = 0x2448 ; AVX512BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm9 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm9[u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13],zero,zero,xmm9[u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u],zero,zero,xmm9[1,8,15],zero,zero,xmm9[4,11,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm9, %xmm21 +; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm7 {%k6} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm7[u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm7, %xmm21 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: movw $3968, %ax # imm = 0xF80 ; AVX512BW-FCP-NEXT: kmovd %eax, %k7 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm21 {%k7} -; AVX512BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm9 +; AVX512BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm7 ; AVX512BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm4 ; AVX512BW-FCP-NEXT: movw $4644, %ax # imm = 0x1224 ; AVX512BW-FCP-NEXT: kmovd %eax, %k4 -; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm4, %ymm20 {%k4} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, 
%ymm20, %xmm22 +; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm18 {%k4} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm22 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm22, %xmm20, %xmm20 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm22 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm18[4,11],zero,zero +; AVX512BW-FCP-NEXT: vporq %xmm22, %xmm18, %xmm18 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm22 ; AVX512BW-FCP-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm20 {%k4} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm23 +; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm18 {%k4} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm23 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[1,8,15],zero,zero,xmm20[4,11],zero,zero,xmm20[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm20, %xmm20 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[1,8,15],zero,zero,xmm18[4,11],zero,zero,xmm18[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: movl $511, %r10d # imm = 0x1FF ; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm20, %ymm10 {%k1} -; AVX512BW-FCP-NEXT: vpblendmw %ymm13, %ymm11, %ymm20 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm20[u,u,u,6,13],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm20 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u],zero,zero,xmm20[4,11],zero,zero,xmm20[0,7,14,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm20, %xmm20 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm15 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,0,0,0,1,3,4,6] -; AVX512BW-FCP-NEXT: vpermd %ymm19, %ymm20, %ymm20 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm6[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm20, %xmm15 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm8 {%k1} +; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm18 {%k6} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,6,13],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[u,u,u,u] +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u],zero,zero,xmm18[4,11],zero,zero,xmm18[0,7,14,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm14 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,0,0,0,1,3,4,6] +; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm18, %ymm18 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = 
ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm19[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm18, %xmm15 ; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm10 {%k5} -; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm14 {%k3} +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm8 {%k5} +; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k3} ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm15 +; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 ; AVX512BW-FCP-NEXT: movl $261632, %r10d # imm = 0x3FE00 ; AVX512BW-FCP-NEXT: kmovd %r10d, %k5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm25[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm14 {%k2} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm20 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,zero,xmm20[5,12],zero,zero,xmm20[1,8,15,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm14, %xmm14 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,0,0,0,1,3,5,6] -; AVX512BW-FCP-NEXT: vpermd %ymm19, %ymm20, %ymm19 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm6[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm14, %xmm19, %xmm14 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm0, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm15, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm0 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm15, %xmm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm15 {%k4} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm19 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,1,8,15],zero,zero,xmm15[4,11],zero,zero,xmm15[u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm15, %xmm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm12 {%k2} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm12, 
%xmm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm12, %xmm12 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,5,6] +; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm15, %ymm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm17, %xmm15 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k6} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm13 {%k4} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-FCP-NEXT: vmovdqa64 208(%rdi), %xmm17 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[5,12] +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %xmm18 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm23, %xmm15 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-FCP-NEXT: vmovdqa64 208(%rdi), %xmm19 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[5,12] -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %xmm20 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm24 = xmm20[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm24, %xmm23 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm23 -; AVX512BW-FCP-NEXT: movl $-134217728, %edi # imm = 0xF8000000 -; AVX512BW-FCP-NEXT: kmovd %edi, %k2 +; AVX512BW-FCP-NEXT: movl $-134217728, %r10d # imm = 0xF8000000 +; AVX512BW-FCP-NEXT: kmovd %r10d, %k2 ; AVX512BW-FCP-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm23, %ymm15 {%k2} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm24 = zero,zero,xmm6[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm24, %xmm23 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm23, %zmm15, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm15 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm13 {%k2} 
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm19[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm19, %xmm15 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm13, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm13 {%k1} ; AVX512BW-FCP-NEXT: kmovd %eax, %k3 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm22, %ymm21 {%k3} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm14 ; AVX512BW-FCP-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 ; AVX512BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} -; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm0 {%k4} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm21 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u],zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u] -; AVX512BW-FCP-NEXT: vporq %xmm21, %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm18 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm18, %ymm0 {%k7} +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm1 {%k2} +; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm14 {%k4} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u] +; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} ; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm4, %ymm18 {%k1} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm18[5,12],zero,zero -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[0,7,14],zero,zero,xmm18[3,10] -; AVX512BW-FCP-NEXT: vporq %xmm21, %xmm18, %xmm18 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm0 {%k3} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm10 {%k2} -; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm0 {%k1} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm18 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u] -; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm17, %ymm0 {%k7} -; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm4, %ymm17 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm17[6,13],zero,zero -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15],zero,zero,xmm17[4,11] -; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm17, 
%xmm17 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm0 {%k3} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm14 {%k2} -; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm0 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm16, %ymm0 {%k7} -; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm9, %ymm16 {%k4} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[2,9],zero,zero,zero,xmm17[5,12] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,4,11],zero,zero,xmm16[0,7,14],zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm0 {%k3} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm15 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] -; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm0, %zmm17 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] -; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm15 {%k1} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm15[5,12],zero,zero +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14],zero,zero,xmm15[3,10] +; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm8 {%k2} +; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm14 {%k1} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u] +; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm25[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} +; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm15 {%k6} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11] +; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm12 {%k2} +; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm14 {%k6} +; 
AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} +; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm15 {%k4} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero +; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm13 {%k2} +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm15 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm16 ; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm16 {%k2} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm18 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[5,12],zero,zero,xmm18[1,8,15,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,0,7,14],zero,zero,xmm16[3,10],zero,zero,zero,xmm16[u,u] -; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm16, %xmm16 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm0, %ymm16 {%k7} -; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm0 {%k4} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm18 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm14 {%k2} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm19 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u] +; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm14, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm16, %ymm14 {%k7} +; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm16 {%k4} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 +; 
AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm0, %ymm17 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k1} ; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm9, %ymm0 {%k1} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm18 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm0, %ymm16 {%k3} -; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm9, %ymm0 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] -; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm0, %ymm17 {%k3} -; AVX512BW-FCP-NEXT: vpblendmw %ymm12, %ymm3, %ymm0 {%k4} -; AVX512BW-FCP-NEXT: vpblendmw %ymm12, %ymm3, %ymm18 {%k1} -; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm21 {%k6} -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm12, %ymm3 {%k6} -; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm12 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm13, %ymm11 {%k4} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm20[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm13, %xmm22, %xmm13 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm16 {%k1} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,xmm19[3,10],zero,zero,zero,xmm19[6,13] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,5,12],zero,zero,xmm16[1,8,15],zero,zero +; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm14 {%k3} +; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm16 {%k6} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm16[u,u,u,u,u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14] +; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k3} +; AVX512BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm19 {%k4} +; AVX512BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm16 {%k1} +; 
AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm20 {%k6} +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm3 {%k6} +; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm10 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm11, %ymm9 {%k4} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[6,13] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512BW-FCP-NEXT: vporq %xmm11, %xmm21, %xmm11 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512BW-FCP-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm12 {%k3} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm21[u,u,3,10],zero,zero,zero,xmm21[6,13],zero,zero,xmm21[u,u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm21, %xmm21 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u],zero,zero,xmm21[1,8,15],zero,zero,xmm21[4,11,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm13, %xmm21, %xmm13 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm20[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[0,7,14] -; AVX512BW-FCP-NEXT: vporq %xmm21, %xmm22, %xmm21 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm21, %ymm13 {%k3} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm11, %xmm21 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u],zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm21, %xmm11, %xmm11 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm10 {%k3} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[u,u,3,10],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[u,u,u,u,u] +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm20 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[1,8,15],zero,zero,xmm20[4,11,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm11, %xmm20, %xmm11 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[1,8,15] -; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm19, %xmm19 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm19 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm19, %ymm11 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm19 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm19, %zmm19 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm18[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[0,7,14] +; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm21, %xmm20 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm20, %ymm11 {%k3} +; 
AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm9, %xmm20 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm9, %xmm9 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15] +; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm9 {%k3} +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] +; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm17, %zmm17 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] +; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm18, %zmm18 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] ; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm20, %zmm20 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm21 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm21, %zmm21 -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm2 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm0, %xmm2 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm21[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm21 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm6[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm21[0],xmm22[0],xmm21[1],xmm22[1],xmm21[2],xmm22[2],xmm21[3],xmm22[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm12, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm2 {%k5} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm12 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[5,12],zero,zero,xmm18[1,8,15],zero,zero,xmm18[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm12, %xmm18, %xmm12 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm6[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm18 = xmm20[0],xmm18[0],xmm20[1],xmm18[1],xmm20[2],xmm18[2],xmm20[3],xmm18[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm13, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm13, %zmm12 {%k5} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm2, %xmm19, %xmm2 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; 
AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [1,2,4,6,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 224(%rdi), %ymm20 +; AVX512BW-FCP-NEXT: vpermd %ymm20, %ymm19, %ymm19 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm19, %zmm10, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm2 {%k5} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm10 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[5,12],zero,zero,xmm16[1,8,15],zero,zero,xmm16[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm10, %xmm16, %xmm10 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,3,4,6,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermd %ymm20, %ymm16, %ymm16 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm16, %zmm11, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm10 {%k5} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm13, %xmm3 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm19[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm3 {%k5} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] -; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm5 {%k1} +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm11, %xmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,3,5,6,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermd %ymm20, %ymm11, %ymm11 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm9, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm3 {%k5} +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm9, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm5 {%k1} ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,2,9],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[u,u,u] ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,10,u,u,u] @@ -14075,13 +14052,13 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movl $4186112, %eax # imm = 0x3FE000 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm0 ; 
AVX512BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm9, %ymm4 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm4 {%k2} ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm0 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero @@ -14094,11 +14071,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%rdi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -14472,29 +14449,29 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf64: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 ; 
AVX512DQ-BW-FCP-NEXT: movw $-28382, %ax # imm = 0x9122 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: kmovq %k1, %k2 ; AVX512DQ-BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6 @@ -14505,11 +14482,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm9 ; AVX512DQ-BW-FCP-NEXT: movw $8772, %ax # imm = 0x2244 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm13, %ymm11, %ymm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: kmovq %k1, %k3 ; AVX512DQ-BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6 @@ -14518,315 +14495,315 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,1,2,4,6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm19 -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm19, %ymm6, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm17, %ymm6, %ymm6 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 240(%rdi), %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm6[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 240(%rdi), %xmm19 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 224(%rdi), %xmm20 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm4, %zmm4 ; AVX512DQ-BW-FCP-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm1 {%k5} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm4 ; AVX512DQ-BW-FCP-NEXT: movw $9288, %ax # imm = 0x2448 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm9 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm9[u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13],zero,zero,xmm9[u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u],zero,zero,xmm9[1,8,15],zero,zero,xmm9[4,11,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm20, 
%xmm9, %xmm21 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm4, %ymm7 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm7[u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm7, %xmm21 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movw $3968, %ax # imm = 0xF80 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm21 {%k7} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm5 ; AVX512DQ-BW-FCP-NEXT: movw $4644, %ax # imm = 0x1224 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm5, %ymm20 {%k4} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm22 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm18 {%k4} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm22 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm22, %xmm20, %xmm20 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm22 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm18[4,11],zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %xmm22, %xmm18, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm22 ; AVX512DQ-BW-FCP-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm20 {%k4} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm23 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm18 {%k4} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm23 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[1,8,15],zero,zero,xmm20[4,11],zero,zero,xmm20[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm20, %xmm20 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[1,8,15],zero,zero,xmm18[4,11],zero,zero,xmm18[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movl $511, %r10d # imm = 0x1FF ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm20, %ymm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm13, %ymm11, %ymm20 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm20[u,u,u,6,13],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm20 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u],zero,zero,xmm20[4,11],zero,zero,xmm20[0,7,14,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm20, %xmm20 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,0,0,0,1,3,4,6] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm19, %ymm20, %ymm20 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 
= ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm6[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm20, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm18 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,6,13],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u],zero,zero,xmm18[4,11],zero,zero,xmm18[0,7,14,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,0,0,0,1,3,4,6] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm17, %ymm18, %ymm18 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm19[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm18, %xmm15 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm10 {%k5} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm14 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm8 {%k5} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k3} ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 ; AVX512DQ-BW-FCP-NEXT: movl $261632, %r10d # imm = 0x3FE00 ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm25[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm14 {%k2} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm20 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,zero,xmm20[5,12],zero,zero,xmm20[1,8,15,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm20, %xmm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,0,0,0,1,3,5,6] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm19, %ymm20, %ymm19 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm6[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm14, %xmm19, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 
%xmm14, %zmm0, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm15, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm0 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm15, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm15 {%k4} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm19 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,1,8,15],zero,zero,xmm15[4,11],zero,zero,xmm15[u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm15, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm12 {%k2} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm12, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,5,6] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm17, %ymm15, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm17, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm13 {%k4} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 208(%rdi), %xmm17 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[5,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %xmm18 +; 
AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm23, %xmm15 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 208(%rdi), %xmm19 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[5,12] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %xmm20 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm24 = xmm20[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm24, %xmm23 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm23 -; AVX512DQ-BW-FCP-NEXT: movl $-134217728, %edi # imm = 0xF8000000 -; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 +; AVX512DQ-BW-FCP-NEXT: movl $-134217728, %r10d # imm = 0xF8000000 +; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k2 ; AVX512DQ-BW-FCP-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm23, %ymm15 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm24 = zero,zero,xmm6[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm24, %xmm23 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm23, %zmm15, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm15 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm13 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm19[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm19, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm13, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm13 {%k1} ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm22, %ymm21 {%k3} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm14 ; AVX512DQ-BW-FCP-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm0 {%k4} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm21 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u],zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm21, %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm18 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm18, %ymm0 {%k7} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm1 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm6, %ymm14 {%k4} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} ; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm5, %ymm18 {%k1} 
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm18[5,12],zero,zero -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[0,7,14],zero,zero,xmm18[3,10] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm21, %xmm18, %xmm18 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm0 {%k3} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm10 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm18 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm17, %ymm0 {%k7} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm5, %ymm17 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm17[6,13],zero,zero -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15],zero,zero,xmm17[4,11] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm0 {%k3} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm14 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm0 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm16, %ymm0 {%k7} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm9, %ymm16 {%k4} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[2,9],zero,zero,zero,xmm17[5,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,4,11],zero,zero,xmm16[0,7,14],zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm0 {%k3} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm15 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm0, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm15 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm15[5,12],zero,zero +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 +; 
AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14],zero,zero,xmm15[3,10] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm8 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm6, %ymm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm25[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm15 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm12 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm6, %ymm14 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm15 {%k4} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm13 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm16 ; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm18 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[5,12],zero,zero,xmm18[1,8,15,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = 
xmm16[u,u,u,u,0,7,14],zero,zero,xmm16[3,10],zero,zero,zero,xmm16[u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm16, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm0, %ymm16 {%k7} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm0 {%k4} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm18 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm4, %ymm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm19 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm16, %ymm14 {%k7} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm4, %ymm16 {%k4} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm0, %ymm17 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k1} ; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm9, %ymm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm18 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm0, %ymm16 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm9, %ymm0 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm0, %ymm17 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm12, %ymm3, %ymm21 {%k4} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm12, %ymm3, %ymm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm0 {%k6} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm12, %ymm3 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpblendmw 
%ymm11, %ymm13, %ymm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm13, %ymm11 {%k4} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm20[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm13, %xmm22, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm16 {%k1} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,xmm19[3,10],zero,zero,zero,xmm19[6,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,5,12],zero,zero,xmm16[1,8,15],zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm14 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm16 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm16[u,u,u,u,u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm19 {%k4} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm16 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm20 {%k6} +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm3 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm11, %ymm9 {%k4} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[6,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %xmm11, %xmm21, %xmm11 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512DQ-BW-FCP-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm12 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm13, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = 
xmm20[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[0,7,14] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm13, %xmm22, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm0 {%k2} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm13, %xmm11, %xmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm10 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[u,u,3,10],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm20 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[1,8,15],zero,zero,xmm20[4,11,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm11, %xmm20, %xmm11 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm20[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[1,8,15] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm13, %xmm19, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm11 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm13, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm19 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm19, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm18[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[0,7,14] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm20, %xmm21, %xmm20 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm20, %ymm11 {%k2} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm9, %xmm20 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm20, %xmm9, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm9 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm17, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm18, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm20, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm21, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm2 ; AVX512DQ-BW-FCP-NEXT: 
vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[4,11],zero,zero,xmm21[0,7,14],zero,zero,xmm21[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm2, %xmm21, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm2, %xmm19, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm20 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm8, %xmm21 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm6[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm21[0],xmm22[0],xmm21[1],xmm22[1],xmm21[2],xmm22[2],xmm21[3],xmm22[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm12, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm2 {%k5} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[5,12],zero,zero,xmm18[1,8,15],zero,zero,xmm18[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm12, %xmm18, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 {%k5} = ymm19[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm6[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm18 = xmm19[0],xmm18[0],xmm19[1],xmm18[1],xmm19[2],xmm18[2],xmm19[3],xmm18[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm12 {%k5} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [1,2,4,6,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 224(%rdi), %ymm20 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm20, %ymm19, %ymm19 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm19, %zmm10, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm2 {%k5} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[5,12],zero,zero,xmm16[1,8,15],zero,zero,xmm16[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm10, %xmm16, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,3,4,6,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm20, %ymm16, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm16, %zmm11, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm10 {%k5} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor 
%xmm0, %xmm3, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm6, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm11, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm0 {%k5} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm3, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm11, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,3,5,6,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm20, %ymm11, %ymm11 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm9, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm3 {%k5} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm9, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u] ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; AVX512DQ-BW-FCP-NEXT: movl $4186112, %eax # imm = 0x3FE000 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm9, %ymm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm5 {%k1} ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm16, %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm17, %zmm0, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15] +; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm14, %zmm0, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm15, %zmm0, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm3, %zmm0, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, 
%xmm5, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%rdi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <448 x i8>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll index 5b607748c5761..99932c0026b23 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll @@ -1364,90 +1364,55 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm6 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm7 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm8 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm9 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm10 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2,3] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm11 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm12 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm13 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm14 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,5,7,5,7,6,7] +; 
AVX2-FCP-NEXT: vpermd %ymm2, %ymm6, %ymm2 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm6 +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm6, %ymm1 +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm5 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm7 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm6 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm7 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm6 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm7 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm6 +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm7 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm8 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm7 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm9 -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm8 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm9 -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm8 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm10 -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm9 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FCP-NEXT: vpshufb %xmm9, 
%xmm3, %xmm10 -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm9 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm11 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm10 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm11 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm10 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm12 -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm11 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX2-FCP-NEXT: vmovq %xmm4, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm5, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm6, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm7, (%r8) -; AVX2-FCP-NEXT: vmovq %xmm8, (%r9) -; AVX2-FCP-NEXT: vmovq %xmm9, (%r11) -; AVX2-FCP-NEXT: vmovq %xmm10, (%r10) -; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) +; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-FCP-NEXT: vmovq %xmm0, (%rsi) +; AVX2-FCP-NEXT: vmovq %xmm8, (%rdx) +; AVX2-FCP-NEXT: vmovq %xmm11, (%rcx) +; AVX2-FCP-NEXT: vmovq %xmm3, (%r8) +; AVX2-FCP-NEXT: vmovq %xmm4, (%r9) +; AVX2-FCP-NEXT: vmovq %xmm5, (%r11) +; AVX2-FCP-NEXT: vmovq %xmm6, (%r10) +; AVX2-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i8_stride8_vf8: @@ -2663,182 +2628,97 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-FCP-LABEL: load_i8_stride8_vf16: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: vmovdqa 112(%rdi), %xmm8 -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm2 -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm6 -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-FCP-NEXT: 
vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm10 -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm9 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm11 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm10 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm12 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm5 +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm6 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm3 +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm8 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1,2],xmm3[3] +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm10 -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm9 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm11 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm10 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm8[2,3] +; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = 
[1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm11 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm10 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm12 -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm11 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm13 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm11 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm9[2,3] -; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm11 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm10 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm12 -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm11 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm12 -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm6, %xmm11 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm13 -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm12 -; AVX2-FCP-NEXT: vmovdqa %xmm1, %xmm9 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm12 -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm11 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm13 -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm12 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm13 -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm12 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm14 +; 
AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm9[2,3] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm6, %xmm6 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm6[3] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm1 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,5,7,6,7] +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm2 +; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm5 +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm4 +; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm15 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3] +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm7 +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0],xmm12[1],xmm15[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm12[0,1],xmm5[2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm13 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm13 -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm12 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm14 -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm13 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1,2],xmm12[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm14 -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm13 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = 
xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm15 -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm14 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm14 -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm13 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm15 -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm14 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm15 -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm14 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm15 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm15 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm14 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm15 -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm14 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm15 -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm14 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm15 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm9, %xmm15 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm8 -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm4 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm9, %xmm4 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3] +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm4 +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FCP-NEXT: vmovaps %xmm2, (%rsi) -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FCP-NEXT: vmovaps %xmm2, (%rdx) +; AVX2-FCP-NEXT: vmovdqa %xmm8, (%rdx) ; AVX2-FCP-NEXT: vmovdqa %xmm10, (%rcx) -; AVX2-FCP-NEXT: vmovdqa %xmm11, (%r8) -; AVX2-FCP-NEXT: vmovdqa %xmm12, (%r9) +; AVX2-FCP-NEXT: vmovdqa %xmm0, (%r8) +; AVX2-FCP-NEXT: vmovdqa %xmm5, (%r9) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovdqa %xmm13, (%rax) +; AVX2-FCP-NEXT: vmovdqa %xmm12, (%rax) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rax) +; AVX2-FCP-NEXT: vmovdqa %xmm13, (%rax) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rax) +; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i8_stride8_vf16: @@ -2962,114 +2842,77 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] -; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] -; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm6 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm5 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; 
AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm9 +; AVX512-FCP-NEXT: vpmovqd %ymm9, %xmm8 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm0 +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX512-FCP-NEXT: vpmovqd %ymm10, %xmm11 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm11, %xmm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm12 +; AVX512-FCP-NEXT: vpmovqb %zmm12, %xmm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm1 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX512-FCP-NEXT: vpsrlq $8, %zmm12, %zmm2 +; AVX512-FCP-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm2 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm14 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] +; AVX512-FCP-NEXT: vpsrlq $16, %zmm12, %zmm14 +; AVX512-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm8 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm11, %xmm11 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3] +; AVX512-FCP-NEXT: vpsrlq $24, %zmm12, %zmm11 +; AVX512-FCP-NEXT: vpmovqb %zmm11, %xmm11 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,3,5,7,5,7,6,7] +; AVX512-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm9 +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX512-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] +; AVX512-FCP-NEXT: vpsrlq $32, %zmm12, %zmm4 +; AVX512-FCP-NEXT: vpmovqb %zmm4, %xmm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm9, %xmm4 +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm10, %xmm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512-FCP-NEXT: vpmovqb %zmm5, %xmm6 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm7 -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm8 -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm7 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512-FCP-NEXT: vpsrlq $40, 
%zmm12, %zmm5 +; AVX512-FCP-NEXT: vpmovqb %zmm5, %xmm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm5 +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm10, %xmm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] +; AVX512-FCP-NEXT: vpsrlq $48, %zmm12, %zmm6 +; AVX512-FCP-NEXT: vpmovqb %zmm6, %xmm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm9, %xmm6 +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm10, %xmm7 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] -; AVX512-FCP-NEXT: vpsrlq $8, %zmm5, %zmm7 +; AVX512-FCP-NEXT: vpsrlq $56, %zmm12, %zmm7 ; AVX512-FCP-NEXT: vpmovqb %zmm7, %xmm7 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm8 -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm7 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm9 -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm8 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] -; AVX512-FCP-NEXT: vpsrlq $16, %zmm5, %zmm8 -; AVX512-FCP-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm9 -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm8 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm10 -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm9 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX512-FCP-NEXT: vpsrlq $24, %zmm5, %zmm9 -; AVX512-FCP-NEXT: vpmovqb %zmm9, %xmm9 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm10 -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm9 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm11 -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm10 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] -; AVX512-FCP-NEXT: vpsrlq $32, %zmm5, %zmm10 -; AVX512-FCP-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm11 -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm10 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; 
AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm12 -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm11 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX512-FCP-NEXT: vpsrlq $40, %zmm5, %zmm11 -; AVX512-FCP-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm12 -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm11 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm13 -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm12 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3] -; AVX512-FCP-NEXT: vpsrlq $48, %zmm5, %zmm12 -; AVX512-FCP-NEXT: vpmovqb %zmm12, %xmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512-FCP-NEXT: vpsrlq $56, %zmm5, %zmm1 -; AVX512-FCP-NEXT: vpmovqb %zmm1, %xmm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512-FCP-NEXT: vmovdqa %xmm4, (%rsi) -; AVX512-FCP-NEXT: vmovdqa %xmm6, (%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm7, (%rcx) +; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512-FCP-NEXT: vmovdqa %xmm1, (%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm2, (%rcx) ; AVX512-FCP-NEXT: vmovdqa %xmm8, (%r8) -; AVX512-FCP-NEXT: vmovdqa %xmm9, (%r9) -; AVX512-FCP-NEXT: vmovdqa %xmm10, (%r11) -; AVX512-FCP-NEXT: vmovdqa %xmm11, (%r10) -; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rax) +; AVX512-FCP-NEXT: vmovdqa %xmm3, (%r9) +; AVX512-FCP-NEXT: vmovdqa %xmm4, (%r11) +; AVX512-FCP-NEXT: vmovdqa %xmm5, (%r10) +; AVX512-FCP-NEXT: vmovdqa %xmm6, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -3194,114 +3037,77 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] -; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 
= [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] -; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm5 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm9 +; AVX512DQ-FCP-NEXT: vpmovqd %ymm9, %xmm8 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX512DQ-FCP-NEXT: vpmovqd %ymm10, %xmm11 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm11, %xmm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm12, %xmm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm1 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm12, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm2 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm14 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] +; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm12, %zmm14 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm8 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm11, %xmm11 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3] +; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm12, %zmm11 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm11, %xmm11 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,3,5,7,5,7,6,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm9 +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] +; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm12, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm9, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm10, %xmm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm5, %xmm6 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb 
%xmm6, %xmm1, %xmm6 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm7 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm12, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm10, %xmm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] +; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm12, %zmm6 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm6, %xmm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm9, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm10, %xmm7 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] -; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm5, %zmm7 +; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm12, %zmm7 ; AVX512DQ-FCP-NEXT: vpmovqb %zmm7, %xmm7 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm7 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm9 -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm8 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] -; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm5, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm9 -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm8 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm10 -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm9 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm5, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm9, %xmm9 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm10 -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm9 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm11 -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm10 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = 
xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] -; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm5, %zmm10 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm11 -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm10 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm12 -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm11 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm5, %zmm11 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm12 -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm11 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm13 -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm12 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3] -; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm5, %zmm12 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm12, %xmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, (%r9) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm10, (%r11) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, (%r10) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%r11) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%r10) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; 
AVX512DQ-FCP-NEXT: retq ; @@ -3426,114 +3232,77 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-FCP-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX512BW-FCP-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm6 -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm5 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vpmovqb %zmm5, %xmm6 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm7 -; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm8 -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm7 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] -; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm5, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX512BW-FCP-NEXT: vpmovqd %ymm4, %xmm3 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm0 +; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512BW-FCP-NEXT: vpmovqd %ymm6, %xmm7 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm1 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512BW-FCP-NEXT: vpmovqb %zmm9, %xmm1 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm1 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm9, %zmm2 +; AVX512BW-FCP-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm2 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] +; 
AVX512BW-FCP-NEXT: vpsrlq $16, %zmm9, %zmm14 +; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] +; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm9, %zmm7 ; AVX512BW-FCP-NEXT: vpmovqb %zmm7, %xmm7 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm8 -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm7 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm9 -; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm8 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,5,7,5,7,6,7] +; AVX512BW-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm4 +; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; AVX512BW-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm6 +; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm7 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] +; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm9, %zmm7 +; AVX512BW-FCP-NEXT: vpmovqb %zmm7, %xmm7 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm7 +; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm6, %xmm8 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] -; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm5, %zmm8 +; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm9, %zmm8 ; AVX512BW-FCP-NEXT: vpmovqb %zmm8, %xmm8 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm9 -; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm8 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm10 -; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm9 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm5, %zmm9 -; AVX512BW-FCP-NEXT: vpmovqb %zmm9, %xmm9 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm10 -; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm9 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm11 -; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm3, 
%xmm10 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] -; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm5, %zmm10 +; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm8 +; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3] +; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm9, %zmm10 ; AVX512BW-FCP-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm11 -; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm10 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm12 -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm11 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm5, %zmm11 -; AVX512BW-FCP-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm12 -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm11 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm13 -; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm12 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3] -; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm5, %zmm12 -; AVX512BW-FCP-NEXT: vpmovqb %zmm12, %xmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vpmovqb %zmm1, %xmm1 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa %xmm6, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa %xmm7, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa %xmm8, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa %xmm9, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa %xmm10, (%r11) -; AVX512BW-FCP-NEXT: vmovdqa %xmm11, (%r10) -; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rax) +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} 
xmm8 = xmm10[0,1],xmm8[2,3] +; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] +; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm9, %zmm6 +; AVX512BW-FCP-NEXT: vpmovqb %zmm6, %xmm6 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa %xmm1, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa %xmm7, (%r11) +; AVX512BW-FCP-NEXT: vmovdqa %xmm8, (%r10) +; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -3658,114 +3427,77 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm5, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm5, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm4, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm6, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm9, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; 
AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm9, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm9, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm9, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm7, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,5,7,5,7,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm9, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm7, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm6, %xmm8 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm5, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm9, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm8, %xmm8 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; 
AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm5, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm9, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm5, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm8 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm9, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm5, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm5, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm12, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = 
[7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm6, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm7, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm8, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm9, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm10, (%r11) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm11, (%r10) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rax) +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm9, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm6, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm7, (%r11) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm8, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <128 x i8>, ptr %in.vec, align 64 @@ -6063,305 +5795,180 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-FCP-LABEL: load_i8_stride8_vf32: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: subq $248, %rsp -; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm7 -; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 112(%rdi), %xmm8 -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm1 -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm13 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm2 -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm5 -; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm15 -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm9 -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm11 -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm9 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0],xmm1[1],xmm9[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm9 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX2-FCP-NEXT: subq $136, %rsp +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm11 -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm14 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5],ymm12[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FCP-NEXT: vmovdqa %xmm10, %xmm4 -; AVX2-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm3 -; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm12 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4],ymm3[5],ymm12[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX2-FCP-NEXT: vmovdqa %xmm1, %xmm4 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm12 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FCP-NEXT: vmovdqa %xmm5, %xmm1 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vmovdqa %ymm9, %ymm7 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] -; AVX2-FCP-NEXT: vmovdqa %ymm10, %ymm9 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = 
ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4],ymm3[5],ymm12[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm10 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm14 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm8 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm0, %ymm6 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm11 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm7 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm4 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1,2],xmm1[3] +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm12 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX2-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm5 +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FCP-NEXT: vmovdqa %xmm8, %xmm5 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm2 -; AVX2-FCP-NEXT: vmovdqa %xmm13, %xmm8 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm3 -; AVX2-FCP-NEXT: 
vmovdqa %xmm15, %xmm10 -; AVX2-FCP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm13, %xmm12 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX2-FCP-NEXT: vmovdqa %xmm1, %xmm4 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4],ymm1[5],ymm9[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0],xmm9[1],xmm13[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm1 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm8, %xmm9 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm2 -; AVX2-FCP-NEXT: vmovdqa %xmm6, 
%xmm8 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm1 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm1 -; AVX2-FCP-NEXT: vmovdqa %xmm11, %xmm14 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm13, %xmm3 -; AVX2-FCP-NEXT: vmovdqa %xmm13, %xmm6 -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX2-FCP-NEXT: vmovdqa %xmm4, %xmm10 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,2,3,1,3,5,7] -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4],ymm1[5],ymm9[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0],xmm9[1],xmm13[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FCP-NEXT: vmovdqa %xmm5, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm2 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm14, %xmm2 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm4 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm3 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5],ymm6[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm4 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm2 +; AVX2-FCP-NEXT: vmovdqa %xmm4, %xmm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm3 -; AVX2-FCP-NEXT: vmovdqa %xmm0, %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm4 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm15, %xmm3 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm4 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm3 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm4 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vpermd (%rsp), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5],ymm9[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7] +; AVX2-FCP-NEXT: vpermd %ymm11, %ymm2, %ymm9 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1,2],xmm13[3] +; AVX2-FCP-NEXT: vpermd %ymm15, %ymm2, %ymm13 +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm2, %ymm15 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0],xmm2[1],xmm12[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm14[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5],ymm14[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm14[1],xmm5[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm5[5],ymm14[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} 
xmm4 = xmm15[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm14[1],xmm4[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm15, %xmm3 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm4 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm3 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm6 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm4 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, (%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, (%r9) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = 
xmm3[0],xmm6[1],xmm3[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-FCP-NEXT: vmovdqa %ymm1, (%r8) +; AVX2-FCP-NEXT: vmovdqa %ymm12, (%r9) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, (%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovdqa %ymm1, (%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rax) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-FCP-NEXT: addq $248, %rsp +; AVX2-FCP-NEXT: vmovdqa %ymm3, (%rax) +; AVX2-FCP-NEXT: addq $136, %rsp ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -6721,231 +6328,186 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-FCP-LABEL: load_i8_stride8_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm18 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm20 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm16 -; AVX512-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm4 -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm1 +; AVX512-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm3 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm13 -; AVX512-FCP-NEXT: vpermd %ymm13, %ymm0, %ymm9 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm15 -; AVX512-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm11 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] -; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm5 -; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] -; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm12 -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm12, %xmm10 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), 
%xmm5 -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm8 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] -; AVX512-FCP-NEXT: vpmovqb %zmm18, %xmm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,6],ymm0[7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm7 -; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm27 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm8 -; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm28 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm8 -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm7 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm12, %xmm10 -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm8 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] -; AVX512-FCP-NEXT: vpsrlq $8, %zmm18, %zmm8 -; AVX512-FCP-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm20 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm29 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm7 -; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm30 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,6],ymm0[7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm7 -; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm31 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm8 -; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm8 -; AVX512-FCP-NEXT: vpshufb 
%xmm7, %xmm1, %xmm7 -; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm3 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm12, %xmm10 -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm8 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] -; AVX512-FCP-NEXT: vpsrlq $16, %zmm18, %zmm8 -; AVX512-FCP-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm3[7] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm18 +; AVX512-FCP-NEXT: vpermd %ymm18, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm6 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 +; AVX512-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm4 +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm12 +; AVX512-FCP-NEXT: vpmovqd %ymm12, %xmm7 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm19 +; AVX512-FCP-NEXT: vpmovqd %ymm19, %xmm9 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] +; AVX512-FCP-NEXT: vpmovqb %zmm20, %xmm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm25 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm4 -; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm4 -; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm23 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm6 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm6 -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm9 -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm6 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] -; AVX512-FCP-NEXT: vpsrlq $24, %zmm18, %zmm6 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] +; AVX512-FCP-NEXT: vpsrlq $8, %zmm20, %zmm6 ; AVX512-FCP-NEXT: vpmovqb %zmm6, %xmm6 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] -; AVX512-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm11 -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm4 -; AVX512-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm9 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm4[7] -; AVX512-FCP-NEXT: vpermd %ymm13, %ymm0, %ymm13 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm4[6,7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX512-FCP-NEXT: vmovdqa %xmm2, %xmm6 -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm15 -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm14 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm12, %xmm0 -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm15 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm14[3] -; AVX512-FCP-NEXT: vpsrlq $32, %zmm18, %zmm14 -; AVX512-FCP-NEXT: vpmovqb %zmm14, %xmm14 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm4 -; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm14 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm14 -; AVX512-FCP-NEXT: vmovdqa %xmm3, %xmm7 -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm0 -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm14 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3] -; AVX512-FCP-NEXT: vpsrlq $40, %zmm18, %zmm4 -; AVX512-FCP-NEXT: vpmovqb %zmm4, %xmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm9, %ymm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm4 -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm14 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5],ymm14[6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm14 -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm0 -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm14 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3] -; AVX512-FCP-NEXT: vpsrlq $48, %zmm18, %zmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm31 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm5 +; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm30 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] +; 
AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm5 +; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm29 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm6 +; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm28 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm5 +; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm27 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm6 +; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm26 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] +; AVX512-FCP-NEXT: vpsrlq $16, %zmm20, %zmm6 +; AVX512-FCP-NEXT: vpmovqb %zmm6, %xmm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm25 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm24 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm23 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm1 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX512-FCP-NEXT: vpsrlq $24, %zmm20, %zmm2 ; AVX512-FCP-NEXT: vpmovqb %zmm2, %xmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] -; AVX512-FCP-NEXT: 
vpshufb %xmm2, %xmm6, %xmm3 -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm8 -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] -; AVX512-FCP-NEXT: vpsrlq $56, %zmm18, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] +; AVX512-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm4 +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm1 +; AVX512-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm7 +; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm7, %ymm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,6],ymm1[7] +; AVX512-FCP-NEXT: vpermd %ymm18, %ymm0, %ymm9 +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm14 +; AVX512-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm11 +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm11, %ymm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,5,7,5,7,6,7] +; AVX512-FCP-NEXT: vpermd %ymm12, %ymm0, %ymm14 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] +; AVX512-FCP-NEXT: vpermd %ymm19, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3] +; AVX512-FCP-NEXT: vpsrlq $32, %zmm20, %zmm15 +; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm1[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3] +; AVX512-FCP-NEXT: vpsrlq $40, %zmm20, %zmm15 +; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm10 +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm13 +; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm10 +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm10 +; AVX512-FCP-NEXT: 
vpshufb %ymm10, %ymm9, %ymm15 +; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm10 +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm11, %ymm10 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5],ymm10[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm13 +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm14, %xmm13 +; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm15 +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3] +; AVX512-FCP-NEXT: vpsrlq $48, %zmm20, %zmm15 +; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm13 +; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm13 +; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm7, %ymm7 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5,6],ymm4[7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm7 +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm7 +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm14, %xmm5 +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3] +; AVX512-FCP-NEXT: vpsrlq $56, %zmm20, %zmm3 ; AVX512-FCP-NEXT: vpmovqb %zmm3, %xmm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm22, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %ymm20, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %ymm21, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, (%r8) -; AVX512-FCP-NEXT: vmovdqa %ymm15, (%r9) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm21, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %ymm22, (%rdx) +; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rcx) +; AVX512-FCP-NEXT: vmovdqa %ymm2, (%r8) +; AVX512-FCP-NEXT: vmovdqa %ymm12, (%r9) +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rax) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa %ymm10, (%rax) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rax) -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -7305,231 +6867,186 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-FCP-LABEL: load_i8_stride8_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm18 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm20 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm16 -; AVX512DQ-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm1 +; AVX512DQ-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm3 +; 
AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm13 -; AVX512DQ-FCP-NEXT: vpermd %ymm13, %ymm0, %ymm9 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm15 -; AVX512DQ-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm11 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] -; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] -; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm12 -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm12, %xmm10 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm8 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] -; AVX512DQ-FCP-NEXT: vpmovqb %zmm18, %xmm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm27 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm28 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm7 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm12, %xmm10 -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm8 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = 
xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] -; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm18, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm20 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm29 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm30 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm31 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm3 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm12, %xmm10 -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm8 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] -; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm18, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm3[7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm18 +; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm6 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 +; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm12 +; AVX512DQ-FCP-NEXT: vpmovqd %ymm12, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = 
xmm7[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm19 +; AVX512DQ-FCP-NEXT: vpmovqd %ymm19, %xmm9 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] +; AVX512DQ-FCP-NEXT: vpmovqb %zmm20, %xmm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm25 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm23 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm6 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm9 -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm6 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] -; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm18, %zmm6 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] +; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm20, %zmm6 ; AVX512DQ-FCP-NEXT: vpmovqb %zmm6, %xmm6 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, 
%ymm19 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm11 -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm4 -; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm9 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm4[7] -; AVX512DQ-FCP-NEXT: vpermd %ymm13, %ymm0, %ymm13 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm4[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm15 -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm14 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm12, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm15 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm14[3] -; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm18, %zmm14 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm14, %xmm14 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm14 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm14 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm14 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3] -; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm18, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; 
AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm9, %ymm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm14 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5],ymm14[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm14 -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm14 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3] -; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm18, %zmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm31 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm30 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm29 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm28 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm27 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm26 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] +; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm20, %zmm6 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm6, %xmm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm0 +; 
AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm25 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm24 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm23 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm1 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm20, %zmm2 ; AVX512DQ-FCP-NEXT: vpmovqb %zmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] -; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm18, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm1 +; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm7, %ymm9 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm0, %ymm9 +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm14 +; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm11 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm11, %ymm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,5,7,5,7,6,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm0, %ymm14 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpermd %ymm19, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3] +; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm20, %zmm15 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3] +; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm20, %zmm15 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm10 +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm10 +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm10 +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm10 +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm11, %ymm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5],ymm10[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm13 +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm14, %xmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm15 +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3] +; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm20, %zmm15 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm13 +; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm13 +; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm7, %ymm7 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5,6],ymm4[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm8 +; AVX512DQ-FCP-NEXT: 
vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm14, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3] +; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm20, %zmm3 ; AVX512DQ-FCP-NEXT: vpmovqb %zmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm15, (%r9) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, (%r9) +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rax) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, (%rax) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax) -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -7837,214 +7354,169 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-FCP-LABEL: load_i8_stride8_vf32: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] -; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm9 -; AVX512BW-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm4 -; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm2 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm30 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512BW-FCP-NEXT: vpermd %ymm11, %ymm1, %ymm26 -; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm26, %ymm3 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm31 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm13 -; AVX512BW-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm27 -; AVX512BW-FCP-NEXT: vpshufb %ymm31, %ymm27, %ymm3 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] +; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm5 +; AVX512BW-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm7 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm8 +; AVX512BW-FCP-NEXT: vpermd %ymm8, %ymm0, %ymm30 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX512BW-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm15 -; AVX512BW-FCP-NEXT: vpermd %ymm15, %ymm1, %ymm28 -; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm28, %ymm1 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-FCP-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm5 -; AVX512BW-FCP-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm3 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm16 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm3 -; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm3, %xmm17 -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm5, %xmm16 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm16[0],xmm17[0],xmm16[1],xmm17[1],xmm16[2],xmm17[2],xmm16[3],xmm17[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] -; AVX512BW-FCP-NEXT: vpmovqb %zmm0, %xmm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm2 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-FCP-NEXT: vmovdqa 96(%rdi), %ymm12 +; AVX512BW-FCP-NEXT: vpmovqd %ymm12, %xmm1 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm16 +; AVX512BW-FCP-NEXT: vpmovqd %ymm16, %xmm0 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3] +; AVX512BW-FCP-NEXT: vpmovqb %zmm4, %xmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm16 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: vpshufb %ymm16, %ymm4, %ymm6 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: vpshufb %ymm17, %ymm26, %ymm7 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,6],ymm6[7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm18 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: vpshufb %ymm18, %ymm27, %ymm7 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = 
ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5,6],ymm6[7] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm31 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %ymm31, %ymm3, %ymm9 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm28, %ymm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm7 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm20 -; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm8 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm20[0],xmm8[1],xmm20[1],xmm8[2],xmm20[2],xmm8[3],xmm20[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] -; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %ymm6, %ymm29 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm4, %ymm7 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm21 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: vpshufb %ymm21, %ymm26, %ymm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5,6],ymm7[7] +; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm2, %ymm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm1, %xmm9 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm0, %xmm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] +; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm4, %zmm10 +; AVX512BW-FCP-NEXT: vpmovqb %zmm10, %xmm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %ymm6, %ymm18 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm27, %ymm8 +; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm7, %ymm6 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm28, %ymm10 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = 
[2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm10 -; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm8 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm24 -; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm10 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm24[0],xmm10[1],xmm24[1],xmm10[2],xmm24[2],xmm10[3],xmm24[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3] -; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm0, %zmm10 -; AVX512BW-FCP-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm24 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm4, %ymm4 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm26, %ymm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,6],ymm4[7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm26 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FCP-NEXT: vpshufb %ymm26, %ymm27, %ymm8 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm27 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FCP-NEXT: vpshufb %ymm27, %ymm28, %ymm10 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm10 -; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm8 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm28 -; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm10 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm28[0],xmm10[1],xmm28[1],xmm10[2],xmm28[2],xmm10[3],xmm28[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3] -; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm0, %zmm10 -; AVX512BW-FCP-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,3,2,3,1,3,5,7] -; AVX512BW-FCP-NEXT: vpermd %ymm9, %ymm8, %ymm9 -; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm9, %ymm10 -; AVX512BW-FCP-NEXT: vpermd %ymm11, %ymm8, %ymm11 -; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm11, %ymm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm10[7] -; AVX512BW-FCP-NEXT: vpermd %ymm13, %ymm8, %ymm10 -; AVX512BW-FCP-NEXT: vpshufb %ymm31, %ymm10, %ymm13 -; AVX512BW-FCP-NEXT: vpermd %ymm15, %ymm8, %ymm15 -; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5],ymm8[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} 
xmm12 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm13 -; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm12 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm13 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1,2],xmm12[3] -; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm0, %zmm13 +; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm30, %ymm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5,6],ymm6[7] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm24 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm3, %ymm10 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm2, %ymm13 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4],ymm10[5],ymm13[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm6[6,7] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm26 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %xmm26, %xmm1, %xmm10 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm27 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %xmm27, %xmm0, %xmm13 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3] +; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm4, %zmm13 ; AVX512BW-FCP-NEXT: vpmovqb %zmm13, %xmm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb %ymm16, %ymm9, %ymm12 -; AVX512BW-FCP-NEXT: vpshufb %ymm17, %ymm11, %ymm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7] -; AVX512BW-FCP-NEXT: vpshufb %ymm18, %ymm10, %ymm13 -; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm15, %ymm14 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm13 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm16 -; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm14 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm16[0],xmm14[1],xmm16[1],xmm14[2],xmm16[2],xmm14[3],xmm16[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3] -; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm0, %zmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm28 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %ymm28, %ymm7, %ymm7 +; AVX512BW-FCP-NEXT: vpbroadcastd 
{{.*#+}} ymm29 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm30, %ymm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm7[7] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm30 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm3, %ymm3 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %ymm17, %ymm2, %ymm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm10[6,7] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm10 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm10[3] +; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm4, %zmm10 +; AVX512BW-FCP-NEXT: vpmovqb %zmm10, %xmm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] +; AVX512BW-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm10 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] +; AVX512BW-FCP-NEXT: vpermd %ymm8, %ymm0, %ymm8 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5,6],ymm5[7] +; AVX512BW-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm11 +; AVX512BW-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm13 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,3,5,7,5,7,6,7] +; AVX512BW-FCP-NEXT: vpermd %ymm12, %ymm5, %ymm12 +; AVX512BW-FCP-NEXT: vpermd %ymm16, %ymm5, %ymm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm15[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3] +; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm4, %zmm14 ; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm9, %ymm13 -; AVX512BW-FCP-NEXT: vpshufb %ymm21, %ymm11, %ymm14 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] -; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm10, %ymm14 -; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm15, %ymm6 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm14[5],ymm6[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: vpshufb 
%xmm13, %xmm1, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm13 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm16 -; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm14 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm16[0],xmm14[1],xmm16[1],xmm14[2],xmm16[2],xmm14[3],xmm16[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3] -; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm0, %zmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7] +; AVX512BW-FCP-NEXT: vpshufb %ymm31, %ymm11, %ymm14 +; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm13, %ymm9 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5],ymm9[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7] +; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm12, %xmm9 +; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm15, %xmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3] +; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm4, %zmm14 ; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm9, %ymm9 -; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm11, %ymm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] -; AVX512BW-FCP-NEXT: vpshufb %ymm26, %ymm10, %ymm10 -; AVX512BW-FCP-NEXT: vpshufb %ymm27, %ymm15, %ymm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5],ymm11[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %ymm1, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %ymm29, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa %ymm12, (%r11) -; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%r10) -; AVX512BW-FCP-NEXT: vmovdqa %ymm0, 
(%rax) +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm10, %ymm9 +; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm8, %ymm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5,6],ymm9[7] +; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm11, %ymm14 +; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm13, %ymm7 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5],ymm7[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] +; AVX512BW-FCP-NEXT: vpshufb %xmm26, %xmm12, %xmm9 +; AVX512BW-FCP-NEXT: vpshufb %xmm27, %xmm15, %xmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3] +; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm4, %zmm14 +; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb %ymm28, %ymm10, %ymm9 +; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm8, %ymm8 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm11, %ymm9 +; AVX512BW-FCP-NEXT: vpshufb %ymm17, %ymm13, %ymm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] +; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm4, %zmm2 +; AVX512BW-FCP-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %ymm2, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %ymm18, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%r9) +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%rax) +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%rax) +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FCP-NEXT: vmovdqa %ymm1, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -8352,214 +7824,169 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-FCP-LABEL: load_i8_stride8_vf32: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm30 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm11, %ymm1, %ymm26 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm26, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpblendd 
{{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm31 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm27 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm31, %ymm27, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm5 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm8, %ymm0, %ymm30 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm15 -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm15, %ymm1, %ymm28 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm28, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm16 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm16, %xmm3, %xmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm16, %xmm5, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm16[0],xmm17[0],xmm16[1],xmm17[1],xmm16[2],xmm17[2],xmm16[3],xmm17[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm0, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 96(%rdi), %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm12, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm16 +; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm16, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3] +; 
AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm4, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm16 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm16, %ymm4, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm17, %ymm26, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,6],ymm6[7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm18 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm18, %ymm27, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5,6],ymm6[7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm31 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm31, %ymm3, %ymm9 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm28, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm20 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm20[0],xmm8[1],xmm20[1],xmm8[2],xmm20[2],xmm8[3],xmm20[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm6, %ymm29 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm4, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm21 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm21, %ymm26, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5,6],ymm7[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm2, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb 
%xmm20, %xmm1, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm0, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm4, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm10, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm6, %ymm18 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm27, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm7, %ymm6 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm28, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm24 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm24[0],xmm10[1],xmm24[1],xmm10[2],xmm24[2],xmm10[3],xmm24[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm0, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm24 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm4, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm26, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,6],ymm4[7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm26 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm26, %ymm27, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm27 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm27, %ymm28, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; 
AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm28 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm28[0],xmm10[1],xmm28[1],xmm10[2],xmm28[2],xmm10[3],xmm28[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm0, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,3,2,3,1,3,5,7] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm9, %ymm8, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm9, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm11, %ymm8, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm11, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm10[7] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm13, %ymm8, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm31, %ymm10, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm15, %ymm8, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5],ymm8[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1,2],xmm12[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm0, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm30, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5,6],ymm6[7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm24 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm3, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm2, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4],ymm10[5],ymm13[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm6[6,7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm26 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm26, %xmm1, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm27 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm27, %xmm0, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm4, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm13, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm16, %ymm9, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm17, %ymm11, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd 
{{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm18, %ymm10, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm15, %ymm14 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm16[0],xmm14[1],xmm16[1],xmm14[2],xmm16[2],xmm14[3],xmm16[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm0, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm28 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm28, %ymm7, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm29 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm30, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm7[7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm30 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm3, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm17, %ymm2, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm10[6,7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm10[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm4, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm10, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm8, %ymm0, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5,6],ymm5[7] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm11 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,3,5,7,5,7,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm12, %ymm5, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm16, %ymm5, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm15[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm4, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm9, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm21, %ymm11, %ymm14 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm10, %ymm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm15, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm14[5],ymm6[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm16[0],xmm14[1],xmm16[1],xmm14[2],xmm16[2],xmm14[3],xmm16[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm0, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm31, %ymm11, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm13, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5],ymm9[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm12, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm15, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm4, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm9, %ymm9 -; 
AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm11, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm26, %ymm10, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm27, %ymm15, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5],ymm11[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm1, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm29, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm12, (%r11) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%r10) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax) +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm10, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm8, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5,6],ymm9[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm11, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm13, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5],ymm7[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm26, %xmm12, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm27, %xmm15, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm4, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm28, %ymm10, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm8, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm11, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm17, %ymm13, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm4, 
%zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm2, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm18, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%r9) +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%rax) +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%rax) +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <256 x i8>, ptr %in.vec, align 64 @@ -13109,641 +12536,471 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-FCP-LABEL: load_i8_stride8_vf64: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: subq $904, %rsp # imm = 0x388 -; AVX2-FCP-NEXT: vmovdqa 368(%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX2-FCP-NEXT: vmovdqa 336(%rdi), %xmm15 -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm15, %xmm2 -; AVX2-FCP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %xmm4 -; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vmovdqa 304(%rdi), %xmm14 -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm14, %xmm4 -; AVX2-FCP-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %xmm5 -; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FCP-NEXT: vmovdqa 272(%rdi), %xmm5 -; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm6 -; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] -; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm5 -; AVX2-FCP-NEXT: vmovdqu %ymm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] -; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm4 +; AVX2-FCP-NEXT: subq $1096, %rsp # imm = 0x448 +; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] -; AVX2-FCP-NEXT: vpermd %ymm6, %ymm0, %ymm5 +; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm10 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm10, %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm13 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm4 +; AVX2-FCP-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] +; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm4 
+; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm0, %ymm13 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm0, %ymm8 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm5 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm12 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm6 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm4 +; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm3 +; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 112(%rdi), %xmm4 -; AVX2-FCP-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm9 -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm9 -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm4 -; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm8 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm2[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm4 +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm15 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1,2],xmm2[3] +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm11 ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm12 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5],ymm12[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: 
vpshufb %xmm6, %xmm2, %xmm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm0 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm10 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm15, %xmm12 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm11 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm14, %xmm12 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm15, %xmm14 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm9 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4],ymm9[5],ymm12[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: 
vpshufb %ymm4, %ymm12, %ymm1 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] +; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3] +; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm13, %xmm1 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm8 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm8 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm8 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm9, %xmm9 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX2-FCP-NEXT: vmovdqa %ymm9, %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm1 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm9 -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm10 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm10 -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm11 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm12 -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm14 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm1 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = 
ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm13, %xmm8 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm8 -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm1 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm8 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm8 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm9 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm12, %ymm1 +; AVX2-FCP-NEXT: vmovdqa %ymm11, %ymm12 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm1 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm14 +; AVX2-FCP-NEXT: vmovdqa %xmm13, %xmm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm14 +; AVX2-FCP-NEXT: vmovdqa %xmm13, %xmm8 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm13 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm15 +; AVX2-FCP-NEXT: vmovdqa %xmm13, %xmm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] +; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm13 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm9, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm10, %xmm1 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm14 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm15 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm5 = 
[3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm9 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm10 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm10 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm11 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm12 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm14 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm7 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = 
ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm14 +; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm12 +; AVX2-FCP-NEXT: vmovdqa %xmm2, %xmm8 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm11 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm2 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm10, %xmm10 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm0 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm6 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm6 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm1 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm1 -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm6 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm6 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm7 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm0 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 
= ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5],ymm6[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm14, %xmm1 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm2 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm2 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded 
Reload -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,2,3,1,3,5,7] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm3 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm5 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,5,7,5,7,6,7] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm7 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm8 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm8[0,1,2],xmm7[3] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm11 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm8 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm12 +; AVX2-FCP-NEXT: 
vpblendd {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm11 +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm12 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm15 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4],ymm1[5],ymm15[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm13[6,7] ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm12 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm9 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm5 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm9 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm6 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm6 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm7 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm7 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm8 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3] -; AVX2-FCP-NEXT: 
vpblendd {{.*#+}} xmm8 = xmm6[0,1],xmm5[2,3] -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm7 +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1,2],xmm2[3] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5],ymm11[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm0 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm13, %ymm1 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 
32-byte Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm14[0],xmm4[1],xmm14[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm9 -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm14, %xmm10 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm11 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm12 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm12 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm13 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm14 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm15 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd 
{{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm0 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm1 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm14[0],xmm4[1],xmm14[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm9 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm8 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FCP-NEXT: vmovdqa %xmm2, %xmm1 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm9 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm10 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm9 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm10 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm10 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm11 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm0 +; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm3 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = 
[2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm13, %ymm1 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm4 +; AVX2-FCP-NEXT: vmovdqa %ymm15, %ymm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm11[u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm14[0],xmm4[1],xmm14[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm0 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm1 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5],ymm11[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm9 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm10, %xmm10 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm11 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm12 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm6, %xmm12 -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm13 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm14 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm15 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm14, %ymm14 -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm15 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm9 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm8 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm9 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm10 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = 
xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm9 -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm10 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm10 -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm11 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm15 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm15[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm14[0],xmm4[1],xmm14[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm10 +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5],ymm11[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm10 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm11 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm12 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm13 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = 
xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm13 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm14 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm15 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm8 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0],xmm13[1],xmm8[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm2 -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = 
xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm8 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm3 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm4 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm11[u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm13, %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm2, (%rsi) @@ -13778,7 +13035,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, 
ptr ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%rax) -; AVX2-FCP-NEXT: addq $904, %rsp # imm = 0x388 +; AVX2-FCP-NEXT: addq $1096, %rsp # imm = 0x448 ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -14534,557 +13791,428 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-FCP-LABEL: load_i8_stride8_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $440, %rsp # imm = 0x1B8 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: subq $232, %rsp +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm29 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512-FCP-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm19 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm23 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm4 -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm2 +; AVX512-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm5 +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm2 +; AVX512-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX512-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm4 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm22 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm3 +; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm3 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm12 -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm12, %ymm3 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm5 -; AVX512-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm13 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm13, %ymm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] -; AVX512-FCP-NEXT: vmovdqa 368(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm5 -; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm30 -; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm6 -; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm25 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = 
xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] -; AVX512-FCP-NEXT: vmovdqa 336(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm7 -; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm24 -; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %xmm15 -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm8 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512-FCP-NEXT: vpmovqb %zmm28, %xmm7 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm16 +; AVX512-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm7 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm3 +; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm19 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm2 +; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpmovqd %ymm2, %xmm4 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm2 +; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm17 +; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm3 +; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpmovqd %ymm3, %xmm12 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm12, %xmm3 +; AVX512-FCP-NEXT: vmovdqa64 %xmm12, %xmm21 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] +; AVX512-FCP-NEXT: vpmovqb %zmm29, %xmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm12 ; AVX512-FCP-NEXT: movb $-64, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1} ; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm6 -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm27 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm1[7] -; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm7 -; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm8 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm10 -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm7 -; AVX512-FCP-NEXT: 
vmovdqa 96(%rdi), %xmm0 -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm20 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm0 -; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm31 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm5 -; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm29 -; AVX512-FCP-NEXT: vpmovqb %zmm29, %xmm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm11 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm0 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm26 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm19 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm3 -; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm21 +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm1 ; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm13 -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm3 -; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm14 -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm14, %xmm5 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm9 -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm9, %xmm0 -; AVX512-FCP-NEXT: vmovdqa %xmm15, %xmm12 -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm15 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] -; AVX512-FCP-NEXT: vpsrlq $8, %zmm28, %zmm3 -; AVX512-FCP-NEXT: vpmovqb %zmm3, %xmm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm8 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm4 -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512-FCP-NEXT: vmovdqa64 
%ymm18, %ymm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm6 -; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm15 -; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm15 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm3 -; AVX512-FCP-NEXT: vmovdqa64 %xmm10, %xmm16 -; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm10 -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm15 -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm3 -; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm15 -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm5 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX512-FCP-NEXT: vpsrlq $8, %zmm29, %zmm3 -; AVX512-FCP-NEXT: vpmovqb %zmm3, %xmm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm11, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm20 -; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm23 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm11 -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm24 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm25 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm14, %xmm3 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm5 -; AVX512-FCP-NEXT: vmovdqa64 %xmm9, %xmm17 -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm15 -; AVX512-FCP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX512-FCP-NEXT: vpsrlq 
$16, %zmm28, %zmm5 -; AVX512-FCP-NEXT: vpmovqb %zmm5, %xmm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm0 -; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm9 -; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm5 -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm4 -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm7, %ymm5 -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm25 -; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm4 -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm15 -; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm26 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5],ymm15[6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm6 -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm5 -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX512-FCP-NEXT: vmovdqa64 %xmm10, %xmm16 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm7 -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm5 -; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm8 -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm3 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX512-FCP-NEXT: vpsrlq $16, %zmm29, %zmm3 -; AVX512-FCP-NEXT: vpmovqb %zmm3, %xmm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm1 -; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm11 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm14, %xmm3 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] -; AVX512-FCP-NEXT: 
vmovdqa64 %xmm17, %xmm15 -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm15, %xmm5 -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm12 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX512-FCP-NEXT: vpsrlq $24, %zmm28, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm24 -; AVX512-FCP-NEXT: vpmovqb %zmm5, %xmm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm5 -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm5 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm5 -; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm9 -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm4 -; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm21 -; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm5 -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm4 -; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm16 -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm3 -; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm6 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX512-FCP-NEXT: vpsrlq $24, %zmm29, %zmm3 -; AVX512-FCP-NEXT: vpmovqb %zmm3, %xmm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] -; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm28 -; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm18 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm31 +; AVX512-FCP-NEXT: vpermd %ymm31, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm20 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm9 -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm19 -; AVX512-FCP-NEXT: vpermd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm10 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm9 -; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm20 -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm14, %xmm10 -; AVX512-FCP-NEXT: vmovdqa64 %xmm14, %xmm22 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm15, %xmm10 -; AVX512-FCP-NEXT: vmovdqa64 %xmm15, %xmm23 -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm13 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] -; AVX512-FCP-NEXT: vpsrlq $32, %zmm24, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm25 -; AVX512-FCP-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm0, %zmm13 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm13 {%k1} -; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm27 -; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm15 -; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6],ymm1[7] -; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm31 -; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm14 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm10 -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm14 -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm11 -; AVX512-FCP-NEXT: vmovdqa64 %xmm5, %xmm24 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] -; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm8 -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm14 -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm12 -; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm21 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3] -; AVX512-FCP-NEXT: vpsrlq $32, %zmm29, %zmm12 -; AVX512-FCP-NEXT: vpmovqb %zmm12, %xmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = 
xmm12[0,1],xmm11[2,3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm11 -; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm16 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm1[7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm11 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm18 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm7 -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] -; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm2 +; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm25 +; AVX512-FCP-NEXT: vpermd %ymm25, %ymm0, %ymm10 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm26 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm14 +; AVX512-FCP-NEXT: vpermd %ymm26, %ymm0, %ymm9 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm27 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm28 +; AVX512-FCP-NEXT: vpmovqd %ymm27, %xmm8 +; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm15 +; AVX512-FCP-NEXT: vpmovqd %ymm28, %xmm1 +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm7 +; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm18 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm15[3] +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512-FCP-NEXT: vpmovqb %zmm30, %xmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm6 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm7 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm12 +; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm23 +; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm24 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm12 +; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm22 +; 
AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm5 +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm14 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6,7] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm2 ; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm12 -; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm19 -; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm4 -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm13 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] -; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm9 -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm14 -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm15 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm4 +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm14 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] -; AVX512-FCP-NEXT: vpsrlq $40, %zmm25, %zmm14 +; AVX512-FCP-NEXT: vpsrlq $8, %zmm29, %zmm14 ; AVX512-FCP-NEXT: vpmovqb %zmm14, %xmm14 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm1 -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm14 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm1[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm14 -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm15 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm12 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm7 +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm14 +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm14 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,6],ymm7[7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm14 +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm10, %ymm14 +; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm16 +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm9, %ymm15 +; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm17 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm14 -; AVX512-FCP-NEXT: vmovdqa64 %xmm10, %xmm20 -; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm11 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm14 -; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm22 -; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm2, 
%xmm13 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3] -; AVX512-FCP-NEXT: vpsrlq $40, %zmm29, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm18 -; AVX512-FCP-NEXT: vpmovqb %zmm13, %xmm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm29 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm10 -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm1 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm21 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm5 -; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm6 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm11 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm1[7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm11 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm14 +; AVX512-FCP-NEXT: vmovdqa %xmm8, %xmm11 +; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm10 +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm10, %xmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3] +; AVX512-FCP-NEXT: vpsrlq $8, %zmm30, %zmm15 +; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm7 +; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm18 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm6 +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm12 +; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm22 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm12 -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm13 -; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm23 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = 
xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm14 -; AVX512-FCP-NEXT: vmovdqa64 %xmm9, %xmm28 -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm15 -; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm16 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm14 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6,7] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm12 +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm20 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm14 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] -; AVX512-FCP-NEXT: vpsrlq $48, %zmm25, %zmm14 +; AVX512-FCP-NEXT: vpsrlq $16, %zmm29, %zmm14 ; AVX512-FCP-NEXT: vpmovqb %zmm14, %xmm14 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm9 -; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm14 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm1[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm14 -; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm15 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm12 {%k1} +; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm7 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm13 +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm14 +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm23 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,6],ymm7[7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm2 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm14 +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm15 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm8 -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm14 -; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm15 -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm15, %xmm11 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] -; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm14 -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm13 -; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm20 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3] -; AVX512-FCP-NEXT: vpsrlq $48, %zmm18, %zmm13 -; AVX512-FCP-NEXT: vpmovqb %zmm13, %xmm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm21 -; AVX512-FCP-NEXT: vpbroadcastd 
{{.*#+}} ymm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm11, %xmm14 +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3] +; AVX512-FCP-NEXT: vpsrlq $16, %zmm30, %zmm15 +; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm21 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm7 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5,6],ymm7[7] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm8 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm3 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm3 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] -; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm6 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm11 -; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm12 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3] -; AVX512-FCP-NEXT: vpsrlq $56, %zmm25, %zmm11 -; AVX512-FCP-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm5 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm5 {%k1} -; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm9 -; AVX512-FCP-NEXT: vpblendd 
{{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,6],ymm3[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm15, %xmm3 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm1 -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm3 -; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm4 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm6 +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm6 +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm8 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3] +; AVX512-FCP-NEXT: vpsrlq $24, %zmm29, %zmm8 +; AVX512-FCP-NEXT: vpmovqb %zmm8, %xmm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm6 {%k1} +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm13, %ymm5 +; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm4 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm11, %xmm1 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX512-FCP-NEXT: vpsrlq $24, %zmm30, %zmm1 +; AVX512-FCP-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm18 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] +; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm1 +; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512-FCP-NEXT: vpermd (%rsp), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] 
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm19 +; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm3 +; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm22 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,3,5,7,5,7,6,7] +; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm8 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm2 +; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm7 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] -; AVX512-FCP-NEXT: vpsrlq $56, %zmm18, %zmm3 +; AVX512-FCP-NEXT: vpsrlq $32, %zmm29, %zmm3 ; AVX512-FCP-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm1, (%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm1, (%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm1, (%rcx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm1, (%r8) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm1, (%r9) +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm17 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 {%k1} +; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm23 +; AVX512-FCP-NEXT: vpermd %ymm31, %ymm0, %ymm4 +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512-FCP-NEXT: vpermd %ymm25, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm24 +; AVX512-FCP-NEXT: vpermd %ymm26, %ymm0, %ymm6 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX512-FCP-NEXT: vpermd %ymm27, %ymm16, %ymm13 +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm0 +; AVX512-FCP-NEXT: vpermd %ymm28, %ymm16, %ymm12 +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] +; AVX512-FCP-NEXT: vpsrlq $32, %zmm30, %zmm15 +; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm16 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = 
[1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm0 +; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm5 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm25 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm10 +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm2 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm15 +; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm2[5],ymm15[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3] +; AVX512-FCP-NEXT: vpsrlq $40, %zmm29, %zmm15 +; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3] +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm15 +; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm22 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm11 +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm4 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm15 +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm11 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm14 +; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm23 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm11 +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm14 +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm12, %xmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3] +; AVX512-FCP-NEXT: vpsrlq $40, %zmm30, %zmm15 +; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm19 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm26 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm5 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX512-FCP-NEXT: vpbroadcastd 
{{.*#+}} ymm11 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm24 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm14 +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm14 +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5],ymm14[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm14 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] +; AVX512-FCP-NEXT: vpsrlq $48, %zmm29, %zmm14 +; AVX512-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm6 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm6, %ymm14 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm14 +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm9 +; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm9 +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm13, %xmm14 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3] +; AVX512-FCP-NEXT: vpsrlq $48, %zmm30, %zmm15 +; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm17 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm11 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm5 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm10 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm5 +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vpshufb 
%xmm9, %xmm8, %xmm8 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm7 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3] +; AVX512-FCP-NEXT: vpsrlq $56, %zmm29, %zmm8 +; AVX512-FCP-NEXT: vpmovqb %zmm8, %xmm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm0, %zmm7 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm7 {%k1} +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm2 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm5 +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm13, %xmm3 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3] +; AVX512-FCP-NEXT: vpsrlq $56, %zmm30, %zmm3 +; AVX512-FCP-NEXT: vpmovqb %zmm3, %xmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm2, (%rsi) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm2, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%r9) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rax) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rax) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512-FCP-NEXT: addq $440, %rsp # imm = 0x1B8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512-FCP-NEXT: addq $232, %rsp ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -15840,557 +14968,428 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-FCP-LABEL: load_i8_stride8_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $440, %rsp # imm = 0x1B8 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: subq $232, %rsp +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm29 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512DQ-FCP-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm19 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm23 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; 
AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm2 +; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm22 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm12 -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm12, %ymm3 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm13 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm13, %ymm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] -; AVX512DQ-FCP-NEXT: vmovdqa 368(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm30 -; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm25 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] -; AVX512DQ-FCP-NEXT: vmovdqa 336(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm24 -; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %xmm15 -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm8 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm28, %xmm7 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm16 +; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm19 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpmovqd %ymm2, %xmm4 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm17 +; 
AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpmovqd %ymm3, %xmm12 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm12, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm12, %xmm21 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] +; AVX512DQ-FCP-NEXT: vpmovqb %zmm29, %xmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm12 ; AVX512DQ-FCP-NEXT: movb $-64, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm27 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm10 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm20 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm31 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm29 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm29, %xmm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm11 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm0 -; AVX512DQ-FCP-NEXT: 
vpbroadcastd {{.*#+}} ymm7 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm26 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm19 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm21 +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm13 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm14 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm14, %xmm5 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm9 -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm9, %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm15, %xmm12 -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm15 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] -; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm28, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm8 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm15 -; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm15 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm10, %xmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm10 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm15 -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm15 -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm5 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm29, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovqb 
%zmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm11, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm23 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm11 -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm24 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm25 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm14, %xmm3 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm9, %xmm17 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm15 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm28, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm5, %xmm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm5 -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm7, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm26 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5],ymm15[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, 
%xmm6, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm10, %xmm16 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm3 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm29, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm14, %xmm3 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm15 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm15, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm12 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm28, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm24 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm5, %xmm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm5 -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm5 -; AVX512DQ-FCP-NEXT: 
vpshufb %ymm10, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm9 -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm16 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm6 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm29, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] -; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm28 -; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm18 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm31 +; AVX512DQ-FCP-NEXT: vpermd %ymm31, %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm20 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm19 -; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm10 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm20 -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm14, %xmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm14, %xmm22 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = 
xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm15, %xmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm15, %xmm23 -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm13 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] -; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm24, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm25 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm0, %zmm13 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm27 -; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm31 -; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm10 -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm14 -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm5, %xmm24 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm14 -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm21 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3] -; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm29, %zmm12 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm12, %xmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm16 -; AVX512DQ-FCP-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm11 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm18 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm25 +; AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm0, %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm26 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm14 +; AVX512DQ-FCP-NEXT: vpermd %ymm26, %ymm0, %ymm9 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm28 +; AVX512DQ-FCP-NEXT: vpmovqd %ymm27, %xmm8 +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm15 +; AVX512DQ-FCP-NEXT: vpmovqd %ymm28, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm18 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm15[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm30, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm7 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm14, %ymm24 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm14, %ymm22 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm14 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6,7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm2 ; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm12 -; AVX512DQ-FCP-NEXT: 
vmovdqa64 %xmm20, %xmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm13 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm9 -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm14 -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm15 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm14 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] -; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm25, %zmm14 +; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm29, %zmm14 ; AVX512DQ-FCP-NEXT: vpmovqb %zmm14, %xmm14 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm14 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm15 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm14 +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm14 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,6],ymm7[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm14 +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm10, %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm16 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm9, %ymm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm17 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm10, %xmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm11 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm13 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3] -; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm29, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm18 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm13, %xmm13 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm29 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm10 -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm1 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm21 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm6 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm14 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, %xmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm10 +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm10, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3] +; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm30, %zmm15 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm18 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm22 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm12 -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm23 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm9, %xmm28 -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm16 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} 
xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm14 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6,7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm20 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm14 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] -; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm25, %zmm14 +; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm29, %zmm14 ; AVX512DQ-FCP-NEXT: vpmovqb %zmm14, %xmm14 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm14 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm15 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm13 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm23 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,6],ymm7[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm15 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm15 -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm15, %xmm11 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm14 -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm20 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3] -; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm18, %zmm13 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm13, %xmm13 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm21 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm11, %xmm14 
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3] +; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm30, %zmm15 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm21 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm7 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm9 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5,6],ymm7[7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm3 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm3 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm6 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm12 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3] -; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm25, %zmm11 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm9 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,6],ymm3[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 
%ymm30, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm15, %xmm3 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm4 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm9 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm8 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3] +; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm29, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm8, %xmm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm6 {%k1} +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm13, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm11, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm30, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm18 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] +; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm1 +; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-FCP-NEXT: vpermd (%rsp), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm2 +; 
AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm19 +; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm22 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,3,5,7,5,7,6,7] +; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm8 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm2 +; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm7 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] -; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm18, %zmm3 +; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm29, %zmm3 ; AVX512DQ-FCP-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm1, (%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm1, (%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm1, (%rcx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm1, (%r8) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm1, (%r9) +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm17 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 {%k1} +; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm23 +; AVX512DQ-FCP-NEXT: vpermd %ymm31, %ymm0, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm24 +; AVX512DQ-FCP-NEXT: vpermd %ymm26, %ymm0, %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm27, %ymm16, %ymm13 +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm0 +; AVX512DQ-FCP-NEXT: vpermd %ymm28, %ymm16, %ymm12 +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] +; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm30, %zmm15 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm16 +; 
AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm5 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm25 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm10 +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm2 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm2[5],ymm15[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3] +; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm29, %zmm15 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3] +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm22 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm11 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm23 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm11 +; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm14 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm12, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3] +; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm30, %zmm15 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm19 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm26 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm5 +; 
AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm24 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm14 +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5],ymm14[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm14 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] +; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm29, %zmm14 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm6, %ymm14 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm9 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm13, %xmm14 +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3] +; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm30, %zmm15 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm17 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm11 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm10 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm9 +; 
AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm7 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3] +; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm29, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm8, %xmm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm13, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3] +; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm30, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rsi) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%r9) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rax) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rax) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512DQ-FCP-NEXT: addq $440, %rsp # imm = 0x1B8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-FCP-NEXT: addq $232, %rsp ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -17073,429 +16072,357 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-FCP-LABEL: load_i8_stride8_vf64: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: subq $328, %rsp # imm = 0x148 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: subq $264, %rsp # imm = 0x108 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512BW-FCP-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm30 -; AVX512BW-FCP-NEXT: vpshufb 
%ymm11, %ymm30, %ymm1 +; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm30, %ymm1 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512BW-FCP-NEXT: vmovdqa 448(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm31 -; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm31, %ymm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm19 -; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm19, %ymm2 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %ymm29 -; AVX512BW-FCP-NEXT: vpermd %ymm29, %ymm0, %ymm14 -; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm14, %ymm3 +; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm12 +; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm12, %ymm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vmovdqa64 416(%rdi), %ymm22 +; AVX512BW-FCP-NEXT: vpermd %ymm22, %ymm0, %ymm20 +; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm20, %ymm2 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %ymm27 +; AVX512BW-FCP-NEXT: vpermd %ymm27, %ymm0, %ymm9 +; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm3 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-FCP-NEXT: vmovdqa64 368(%rdi), %xmm21 -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm21, %xmm2 -; AVX512BW-FCP-NEXT: vmovdqa 352(%rdi), %xmm4 -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm3 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm23 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-FCP-NEXT: vmovdqa 336(%rdi), %xmm12 -; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm12, %xmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %xmm28 -; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm28, %xmm6 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX512BW-FCP-NEXT: vpmovqb %zmm18, %xmm5 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 352(%rdi), %ymm24 +; AVX512BW-FCP-NEXT: vpmovqd %ymm24, %xmm18 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm18, %xmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %ymm23 +; AVX512BW-FCP-NEXT: vpmovqd %ymm23, %xmm17 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm17, %xmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = 
xmm3[0,1,2],xmm2[3] +; AVX512BW-FCP-NEXT: vpmovqb %zmm26, %xmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm19 ; AVX512BW-FCP-NEXT: movb $-64, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm17 -; AVX512BW-FCP-NEXT: vpshufb %ymm11, %ymm17, %ymm1 +; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm16 +; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm16, %ymm1 ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm3 -; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm5 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm2 -; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm6 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm27 -; AVX512BW-FCP-NEXT: vpermd %ymm27, %ymm0, %ymm16 -; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm16, %ymm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 112(%rdi), %xmm26 -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm26, %xmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 96(%rdi), %xmm24 -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm24, %xmm7 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512BW-FCP-NEXT: vmovdqa64 80(%rdi), %xmm22 -; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm22, %xmm25 -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm8, %xmm23 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm23[0],xmm25[0],xmm23[1],xmm25[1],xmm23[2],xmm25[2],xmm23[3],xmm25[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1,2],xmm10[3] -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512BW-FCP-NEXT: vpmovqb %zmm10, %xmm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] +; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm4 +; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512BW-FCP-NEXT: vmovdqa64 160(%rdi), %ymm31 +; AVX512BW-FCP-NEXT: vpermd %ymm31, %ymm0, %ymm10 +; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm11 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm21 +; AVX512BW-FCP-NEXT: vpermd %ymm21, %ymm0, %ymm7 +; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm0 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 96(%rdi), %ymm28 +; AVX512BW-FCP-NEXT: vpmovqd %ymm28, %xmm5 +; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512BW-FCP-NEXT: vpmovqd %ymm25, %xmm11 +; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm13 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1,2],xmm8[3] +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512BW-FCP-NEXT: vpmovqb %zmm8, %xmm15 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = 
xmm15[0,1],xmm13[2,3] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: vpshufb %ymm11, %ymm30, %ymm0 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm31, %ymm13 +; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm30, %ymm0 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm12, %ymm13 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm19, %ymm13 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm15 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm29 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm20, %ymm13 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm15 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-FCP-NEXT: vmovdqa64 %xmm21, %xmm5 -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm21, %xmm15 -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm23 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm23[0],xmm15[0],xmm23[1],xmm15[1],xmm23[2],xmm15[2],xmm23[3],xmm15[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm23 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-FCP-NEXT: vmovdqa %xmm12, %xmm7 -; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm12, %xmm25 -; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm28, %xmm20 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm20[0],xmm25[0],xmm20[1],xmm25[1],xmm20[2],xmm25[2],xmm20[3],xmm25[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3] -; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm18, %zmm15 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm18, %xmm13 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm17, %xmm15 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3] +; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm26, %zmm15 ; AVX512BW-FCP-NEXT: vpmovqb %zmm15, %xmm15 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vpshufb %ymm11, %ymm17, %ymm0 -; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm15 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm13 +; AVX512BW-FCP-NEXT: 
vinserti64x4 $1, %ymm0, %zmm0, %zmm13 {%k1} +; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm16, %ymm0 +; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm4, %ymm15 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm15 -; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm16, %ymm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5],ymm11[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm26, %xmm11 -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm24, %xmm13 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm22, %xmm13 -; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm8, %xmm15 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3] -; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm10, %zmm13 -; AVX512BW-FCP-NEXT: vpmovqb %zmm13, %xmm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm10, %ymm15 +; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm15 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3] +; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm8, %zmm15 +; AVX512BW-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm21 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: vpshufb %ymm21, %ymm30, %ymm0 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm31, %ymm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm19, %ymm11 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm12 -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm13 -; AVX512BW-FCP-NEXT: vmovdqa %xmm4, %xmm1 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = 
[2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %xmm7, %xmm23 -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm28, %xmm20 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm20[0],xmm15[0],xmm20[1],xmm15[1],xmm20[2],xmm15[2],xmm20[3],xmm15[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3] -; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm18, %zmm15 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm30, %ymm0 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm12, %ymm13 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm20, %ymm13 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm9, %ymm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm29 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm18, %xmm13 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm17, %xmm15 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3] +; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm26, %zmm15 ; AVX512BW-FCP-NEXT: vpmovqb %zmm15, %xmm15 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vpshufb %ymm21, %ymm17, %ymm0 -; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm3, %ymm15 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm13 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm13 {%k1} +; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm16, %ymm0 +; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm15 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm15 -; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm16, %ymm9 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm15[5],ymm9[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm26, %xmm9 -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm24, %xmm11 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm22, %xmm11 -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm13 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] -; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] 
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm15 +; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm7, %ymm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm5, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm15 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3] +; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm8, %zmm15 +; AVX512BW-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm30, %ymm0 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm31, %ymm9 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm31 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FCP-NEXT: vpshufb %ymm31, %ymm19, %ymm9 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm30, %ymm0 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm20, %ymm12 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm30 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm14, %ymm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7] +; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm9, %ymm9 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm12[5],ymm9[6,7] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-FCP-NEXT: vmovdqa64 %xmm5, %xmm19 -; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %xmm1, %xmm20 -; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm12 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm28, %xmm15 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3] -; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm18, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm25 -; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm11 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm17, %ymm0 -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FCP-NEXT: vpshufb %ymm31, %ymm2, %ymm2 -; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm16, %ymm1 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm26, %xmm1 -; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm24, %xmm2 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm22, %xmm2 -; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm3 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm29 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm18, %xmm12 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm19 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm17, %xmm6 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3] +; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm26, %zmm12 +; AVX512BW-FCP-NEXT: vpmovqb %zmm12, %xmm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3] +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm16, %ymm0 +; AVX512BW-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7] +; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm3 +; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm7, %ymm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm5, %xmm1 +; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm11, %xmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm10, %zmm2 +; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm8, %zmm2 ; AVX512BW-FCP-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7] -; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: vpermd (%rsp), %ymm3, %ymm7 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm2 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm9 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm2[7] -; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: vpermd %ymm29, %ymm3, %ymm14 -; AVX512BW-FCP-NEXT: 
vpbroadcastd {{.*#+}} ymm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm11 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm14, %ymm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm19, %xmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %xmm19, %xmm16 -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm20, %xmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %xmm20, %xmm17 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm20 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm23, %xmm15 -; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm28, %xmm29 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm29[0],xmm15[0],xmm29[1],xmm15[1],xmm29[2],xmm15[2],xmm29[3],xmm15[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3] -; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm18, %zmm15 -; AVX512BW-FCP-NEXT: vpmovqb %zmm15, %xmm15 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,2,3,1,3,5,7] +; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm2 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512BW-FCP-NEXT: vpermd %ymm22, %ymm6, %ymm2 +; AVX512BW-FCP-NEXT: vpermd %ymm27, %ymm6, %ymm3 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm2, %ymm5 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm27 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vpshufb %ymm27, %ymm3, %ymm11 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4],ymm5[5],ymm11[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,3,5,7,5,7,6,7] +; AVX512BW-FCP-NEXT: vpermd %ymm24, %ymm12, %ymm4 +; AVX512BW-FCP-NEXT: vpermd %ymm23, %ymm12, %ymm5 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm23 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm4, %xmm13 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm24 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm5, %xmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3] +; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm26, %zmm14 +; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = 
xmm14[0,1],xmm13[2,3] +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm14 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm13 +; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm17 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm17, %ymm15 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] +; AVX512BW-FCP-NEXT: vpermd %ymm31, %ymm6, %ymm16 +; AVX512BW-FCP-NEXT: vpermd %ymm21, %ymm6, %ymm15 +; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm16, %ymm6 +; AVX512BW-FCP-NEXT: vpshufb %ymm27, %ymm15, %ymm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7] +; AVX512BW-FCP-NEXT: vpermd %ymm28, %ymm12, %ymm13 +; AVX512BW-FCP-NEXT: vpermd %ymm25, %ymm12, %ymm18 +; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm13, %xmm10 +; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm18, %xmm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3] +; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm8, %zmm12 +; AVX512BW-FCP-NEXT: vpmovqb %zmm12, %xmm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm14, %zmm6 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm10 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm23 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm2, %ymm12 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm3, %ymm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm4, %xmm12 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm22 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %xmm22, %xmm5, %xmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] +; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm26, %zmm14 +; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] ; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm19 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: vpshufb %ymm13, %ymm19, %ymm9 -; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm18 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm18, %ymm15 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5,6],ymm9[7] -; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: vpermd %ymm27, 
%ymm3, %ymm21 -; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm3 -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm21, %ymm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm26, %xmm9 -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm24, %xmm11 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm22, %xmm11 -; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm8, %xmm13 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] -; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm29 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm9 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm11 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm16, %xmm12 -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm17, %xmm13 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm23, %xmm20 -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm28, %xmm27 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm27[0],xmm20[0],xmm27[1],xmm20[1],xmm27[2],xmm20[2],xmm27[3],xmm20[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm12[3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm27 -; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm25, %zmm12 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm10 +; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm17, %ymm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5,6],ymm10[7] +; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm16, %ymm14 +; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm15, %ymm9 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5],ymm9[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] +; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm13, %xmm10 +; AVX512BW-FCP-NEXT: vpshufb %xmm22, %xmm18, %xmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0,1,2],xmm10[3] +; AVX512BW-FCP-NEXT: vpsrlq 
$40, %zmm8, %zmm14 +; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm21 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm9 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm1, %ymm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm2, %ymm10 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm24 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm4, %xmm10 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm5, %xmm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3] +; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm26, %zmm12 ; AVX512BW-FCP-NEXT: vpmovqb %zmm12, %xmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm12[0,1],xmm3[2,3] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm19, %ymm9 -; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm18, %ymm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm10 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm9 +; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm17, %ymm12 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5,6],ymm9[7] -; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm12 -; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm21, %ymm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm12[5],ymm4[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm26, %xmm9 -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm24, %xmm11 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm22, %xmm11 -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm12 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] -; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm20 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = 
[2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm3 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm4 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm14, %ymm9 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4],ymm4[5],ymm9[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm16, %xmm9 -; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm17, %xmm11 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm23, %xmm12 -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm28, %xmm13 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1,2],xmm9[3] -; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm27, %zmm12 -; AVX512BW-FCP-NEXT: vpmovqb %zmm12, %xmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm0, %zmm9 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm19, %ymm3 -; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm18, %ymm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5,6],ymm3[7] -; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm12 -; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm21, %ymm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5],ymm3[6,7] -; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm26, %xmm12 -; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm24, %xmm4 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3] -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm22, %xmm12 -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm11 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] -; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 +; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm16, %ymm12 +; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] +; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm13, %xmm12 +; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm18, %xmm14 +; AVX512BW-FCP-NEXT: 
vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] +; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm8, %zmm14 +; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm0 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm1 +; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FCP-NEXT: vpshufb %ymm31, %ymm5, %ymm1 -; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm14, %ymm2 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm1 +; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm3, %ymm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm16, %xmm2 -; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm17, %xmm4 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm23, %xmm9 -; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm28, %xmm11 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3] -; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm27, %zmm9 -; AVX512BW-FCP-NEXT: vpmovqb %zmm9, %xmm9 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm19, %ymm0 -; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm18, %ymm9 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FCP-NEXT: vpshufb %ymm31, %ymm15, %ymm9 -; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm21, %ymm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm26, %xmm5 -; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm24, %xmm1 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm22, %xmm5 -; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm4 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] -; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm10, %zmm4 -; AVX512BW-FCP-NEXT: vpmovqb %zmm4, %xmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} 
xmm1 = xmm4[0,1],xmm1[2,3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm4, %xmm1 +; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm5, %xmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm26, %zmm2 +; AVX512BW-FCP-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm11, %ymm0 +; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm17, %ymm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm16, %ymm2 +; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm15, %ymm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm13, %xmm2 +; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm18, %xmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] +; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm8, %zmm3 +; AVX512BW-FCP-NEXT: vpmovqb %zmm3, %xmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm1, (%rsi) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm1, (%rdx) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm1, (%rcx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm1, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, (%rax) ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512BW-FCP-NEXT: addq $328, %rsp # imm = 0x148 +; AVX512BW-FCP-NEXT: addq $264, %rsp # imm = 0x108 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -18178,429 +17105,357 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-FCP-LABEL: load_i8_stride8_vf64: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: subq $328, %rsp # imm = 0x148 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: subq $264, %rsp # imm = 0x108 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm30 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm11, %ymm30, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm30, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 448(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm31 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm31, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm19 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm19, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %ymm29 -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm29, %ymm0, %ymm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm14, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm12, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 416(%rdi), %ymm22 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm22, %ymm0, %ymm20 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm20, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %ymm27 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm27, %ymm0, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 368(%rdi), %xmm21 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm21, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 352(%rdi), %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm23 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 336(%rdi), %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm12, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %xmm28 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm28, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm18, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 352(%rdi), %ymm24 +; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm24, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm18, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %ymm23 +; 
AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm23, %xmm17 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm17, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm26, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm19 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm17 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm11, %ymm17, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm16, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm27 -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm27, %ymm0, %ymm16 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm16, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 112(%rdi), %xmm26 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm26, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 96(%rdi), %xmm24 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm24, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 80(%rdi), %xmm22 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm22, %xmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm8, %xmm23 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm23[0],xmm25[0],xmm23[1],xmm25[1],xmm23[2],xmm25[2],xmm23[3],xmm25[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1,2],xmm10[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm10, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 160(%rdi), %ymm31 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm31, %ymm0, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm21 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm21, %ymm0, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 96(%rdi), %ymm28 +; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm28, %xmm5 
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm25, %xmm11 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1,2],xmm8[3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm8, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm11, %ymm30, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm31, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm30, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm12, %ymm13 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm19, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm29 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm20, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm15 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %xmm21, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm21, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm23 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm23[0],xmm15[0],xmm23[1],xmm15[1],xmm23[2],xmm15[2],xmm23[3],xmm15[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm23 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm12, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm12, %xmm25 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm28, %xmm20 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm20[0],xmm25[0],xmm20[1],xmm25[1],xmm20[2],xmm25[2],xmm20[3],xmm25[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm18, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm18, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm17, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3] 
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm26, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm15, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm11, %ymm17, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm13 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm16, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm4, %ymm15 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm16, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5],ymm11[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm26, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm24, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm22, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm8, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm10, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm13, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm10, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm8, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm21 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm21, %ymm30, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm31, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm19, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = 
[2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %xmm7, %xmm23 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm28, %xmm20 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm20[0],xmm15[0],xmm20[1],xmm15[1],xmm20[2],xmm15[2],xmm20[3],xmm15[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm18, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm30, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm12, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm20, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm9, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm29 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm18, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm17, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm26, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm15, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm21, %ymm17, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm3, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm13 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm16, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm15 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm16, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = 
ymm9[0,1,2,3,4],ymm15[5],ymm9[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm26, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm24, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm22, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm7, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm5, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm8, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm30, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm31, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm31 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm31, %ymm19, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm30, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm20, %ymm12 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm30 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm14, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm9, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm12[5],ymm9[6,7] ; 
AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %xmm5, %xmm19 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %xmm1, %xmm20 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm28, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm18, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm17, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm31, %ymm2, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm16, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm26, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm24, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm22, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm29 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm18, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm19 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm17, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm26, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm12, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm16, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm7, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm5, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm11, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm10, %zmm2 +; 
AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm8, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7] -; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vpermd (%rsp), %ymm3, %ymm7 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm29, %ymm3, %ymm14 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm14, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm19, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %xmm19, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm20, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %xmm20, %xmm17 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm20 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm23, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm28, %xmm29 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm29[0],xmm15[0],xmm29[1],xmm15[1],xmm29[2],xmm15[2],xmm29[3],xmm15[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm18, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm15, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,2,3,1,3,5,7] +; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} 
ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm22, %ymm6, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm27, %ymm6, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm2, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm27 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm27, %ymm3, %ymm11 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4],ymm5[5],ymm11[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,3,5,7,5,7,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm24, %ymm12, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm23, %ymm12, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm23 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm4, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm24 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm24, %xmm5, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm26, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm17 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm17, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm31, %ymm6, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm21, %ymm6, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm16, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm27, %ymm15, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm28, %ymm12, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm25, %ymm12, %ymm18 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm13, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm24, %xmm18, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm8, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm12, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm14, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm23 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm2, %ymm12 +; 
AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm3, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm4, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm22 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm22, %xmm5, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm26, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm19 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm13, %ymm19, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm18 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm18, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5,6],ymm9[7] -; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm27, %ymm3, %ymm21 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm21, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm26, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm24, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm22, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm8, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = 
ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm16, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm17, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm23, %xmm20 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm28, %xmm27 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm27[0],xmm20[0],xmm27[1],xmm20[1],xmm27[2],xmm20[2],xmm27[3],xmm20[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm12[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm25, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm17, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5,6],ymm10[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm16, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm15, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5],ymm9[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm13, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm22, %xmm18, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0,1,2],xmm10[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm8, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm1, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm2, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm24 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm24, %xmm4, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm5, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm26, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm12, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm12[0,1],xmm3[2,3] -; 
AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm19, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm18, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm17, %ymm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5,6],ymm9[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm21, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm12[5],ymm4[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm26, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm24, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm22, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm14, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4],ymm4[5],ymm9[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm16, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm17, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm23, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm28, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1,2],xmm9[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm27, %zmm12 -; 
AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm12, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm0, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm19, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm18, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5,6],ymm3[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm21, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5],ymm3[6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm26, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm24, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm22, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm16, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm24, %xmm13, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm18, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm8, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm31, %ymm5, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm14, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm3, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; 
AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm16, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm17, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm23, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm28, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm27, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm9, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm19, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm18, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm31, %ymm15, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm21, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm26, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm24, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm22, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm10, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm4, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm4, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm5, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm26, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm11, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm17, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm16, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm15, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm13, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm18, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm8, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm3, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = 
xmm3[0,1],xmm2[2,3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, (%rax) ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512DQ-BW-FCP-NEXT: addq $328, %rsp # imm = 0x148 +; AVX512DQ-BW-FCP-NEXT: addq $264, %rsp # imm = 0x108 ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <512 x i8>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll index 941b18db0931a..f7a44fea5b02b 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll @@ -1185,451 +1185,429 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX2-LABEL: store_i8_stride5_vf16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-NEXT: vmovdqa (%rcx), %xmm4 -; AVX2-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6 -; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[6],zero,zero,zero,zero,ymm6[7],zero,zero,zero,zero,ymm6[8],zero,zero,zero,zero,ymm6[9,25],zero,zero,zero,zero,ymm6[26],zero,zero,zero,zero,ymm6[27],zero,zero,zero,zero,ymm6[28] -; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[6],zero,zero,zero,zero,ymm8[7],zero,zero,zero,zero,ymm8[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[26],zero,zero,zero,zero,ymm8[27],zero,zero,zero,zero,ymm8[28],zero -; AVX2-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9],zero,zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero,zero -; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm5[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[7],zero,zero,zero,zero,ymm9[8],zero,zero,zero,zero,ymm9[9],zero,zero,zero,ymm9[26],zero,zero,zero,zero,ymm9[27],zero,zero,zero,zero,ymm9[28],zero,zero,zero -; AVX2-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] -; AVX2-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,2] -; AVX2-NEXT: vpermq {{.*#+}} ymm8 = 
ymm8[0,1,1,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2] -; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8],zero,zero,zero,ymm6[1,9],zero,zero,zero,ymm6[2,10],zero,zero,zero,ymm6[19,27],zero,zero,zero,ymm6[20,28],zero,zero,zero,ymm6[21,29],zero,zero,zero -; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,0] -; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[3,19],zero,zero,zero,ymm5[28,20],zero,zero,zero,ymm5[29,21],zero,zero,zero,ymm5[30,22] -; AVX2-NEXT: vpor %ymm6, %ymm5, %ymm5 -; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,0,1,1] -; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; AVX2-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[10,11],zero,zero,zero,xmm3[12,13],zero,zero,zero,xmm3[14,15],zero -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero -; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-NEXT: vmovdqa (%r8), %xmm0 +; AVX2-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 +; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[6],zero,zero,zero,zero,ymm2[7],zero,zero,zero,zero,ymm2[8],zero,zero,zero,zero,ymm2[9,25],zero,zero,zero,zero,ymm2[26],zero,zero,zero,zero,ymm2[27],zero,zero,zero,zero,ymm2[28] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm4[6],zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero +; AVX2-NEXT: vpor %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9],zero,zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28],zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9],zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero,zero,zero +; AVX2-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = 
[255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[1,3,2,3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,13],zero,zero,zero,xmm4[6,14],zero,zero,zero,xmm4[7,15],zero +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,2,3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm5[5,13],zero,zero,zero,xmm5[6,14],zero,zero,zero,xmm5[7,15],zero,zero,zero +; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] +; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,0] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,8],zero,zero,zero,ymm1[1,9],zero,zero,zero,ymm1[2,10],zero,zero,zero,ymm1[3,19],zero,zero,zero,ymm1[28,20],zero,zero,zero,ymm1[29,21],zero,zero,zero,ymm1[30,22] +; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, (%r9) +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] +; AVX2-NEXT: vpor %xmm0, %xmm4, %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, 64(%r9) -; AVX2-NEXT: vmovdqa %ymm5, (%r9) -; AVX2-NEXT: vmovdqa %ymm7, 32(%r9) +; AVX2-NEXT: vmovdqa %ymm3, 32(%r9) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: store_i8_stride5_vf16: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm4 -; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[6],zero,zero,zero,zero,ymm6[7],zero,zero,zero,zero,ymm6[8],zero,zero,zero,zero,ymm6[9,25],zero,zero,zero,zero,ymm6[26],zero,zero,zero,zero,ymm6[27],zero,zero,zero,zero,ymm6[28] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[6],zero,zero,zero,zero,ymm8[7],zero,zero,zero,zero,ymm8[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[26],zero,zero,zero,zero,ymm8[27],zero,zero,zero,zero,ymm8[28],zero -; AVX2-FP-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9],zero,zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero,zero -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm5[2,3,0,1] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[7],zero,zero,zero,zero,ymm9[8],zero,zero,zero,zero,ymm9[9],zero,zero,zero,ymm9[26],zero,zero,zero,zero,ymm9[27],zero,zero,zero,zero,ymm9[28],zero,zero,zero -; AVX2-FP-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] -; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,2] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,1] -; 
AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8],zero,zero,zero,ymm6[1,9],zero,zero,zero,ymm6[2,10],zero,zero,zero,ymm6[19,27],zero,zero,zero,ymm6[20,28],zero,zero,zero,ymm6[21,29],zero,zero,zero -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,0] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[3,19],zero,zero,zero,ymm5[28,20],zero,zero,zero,ymm5[29,21],zero,zero,zero,ymm5[30,22] -; AVX2-FP-NEXT: vpor %ymm6, %ymm5, %ymm5 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,0,1,1] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 -; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[10,11],zero,zero,zero,xmm3[12,13],zero,zero,zero,xmm3[14,15],zero -; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero -; AVX2-FP-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15] -; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-FP-NEXT: vmovdqa (%r8), %xmm0 +; AVX2-FP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX2-FP-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[6],zero,zero,zero,zero,ymm2[7],zero,zero,zero,zero,ymm2[8],zero,zero,zero,zero,ymm2[9,25],zero,zero,zero,zero,ymm2[26],zero,zero,zero,zero,ymm2[27],zero,zero,zero,zero,ymm2[28] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm4[6],zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero +; AVX2-FP-NEXT: vpor %ymm4, %ymm3, %ymm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9],zero,zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28],zero,zero +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9],zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero,zero,zero +; AVX2-FP-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] +; 
AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[1,3,2,3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,13],zero,zero,zero,xmm4[6,14],zero,zero,zero,xmm4[7,15],zero +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,2,3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm5[5,13],zero,zero,zero,xmm5[6,14],zero,zero,zero,xmm5[7,15],zero,zero,zero +; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,0] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,8],zero,zero,zero,ymm1[1,9],zero,zero,zero,ymm1[2,10],zero,zero,zero,ymm1[3,19],zero,zero,zero,ymm1[28,20],zero,zero,zero,ymm1[29,21],zero,zero,zero,ymm1[30,22] +; AVX2-FP-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-FP-NEXT: vmovdqa %ymm1, (%r9) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] +; AVX2-FP-NEXT: vpor %xmm0, %xmm4, %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, 64(%r9) -; AVX2-FP-NEXT: vmovdqa %ymm5, (%r9) -; AVX2-FP-NEXT: vmovdqa %ymm7, 32(%r9) +; AVX2-FP-NEXT: vmovdqa %ymm3, 32(%r9) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: store_i8_stride5_vf16: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm2 -; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm3 -; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm4 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8],zero,zero,zero,ymm7[1,9],zero,zero,zero,ymm7[2,10],zero,zero,zero,ymm7[19,27],zero,zero,zero,ymm7[20,28],zero,zero,zero,ymm7[21,29],zero,zero,zero -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm5[0,2,2,0] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,8],zero,zero,zero,ymm8[1,9],zero,zero,zero,ymm8[2,10],zero,zero,zero,ymm8[3,19],zero,zero,zero,ymm8[28,20],zero,zero,zero,ymm8[29,21],zero,zero,zero,ymm8[30,22] -; AVX2-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,0,1,1] -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm8 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7] -; AVX2-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm5 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm5[3,7],zero,zero,zero,ymm5[8,12],zero,zero,zero,ymm5[9,13],zero,zero,zero,ymm5[18,22],zero,zero,zero,ymm5[19,23],zero,zero,zero,ymm5[24,28],zero,zero -; 
AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,6,2,3,7] -; AVX2-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,6],zero,zero,zero,ymm6[3,7],zero,zero,zero,ymm6[8,12],zero,zero,zero,ymm6[9,17],zero,zero,zero,ymm6[22,18],zero,zero,zero,ymm6[23,19],zero,zero,zero,ymm6[24,28] -; AVX2-FCP-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,1,2,2,2,2,2,2] -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm6, %ymm6 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 -; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[10,11],zero,zero,zero,xmm2[12,13],zero,zero,zero,xmm2[14,15],zero -; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero -; AVX2-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12],zero,zero,zero,zero,xmm4[13],zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,xmm4[15] -; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm0 +; AVX2-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX2-FCP-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[1,3,2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,13],zero,zero,zero,xmm3[6,14],zero,zero,zero,xmm3[7,15],zero +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[1,3,2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm4[5,13],zero,zero,zero,xmm4[6,14],zero,zero,zero,xmm4[7,15],zero,zero,zero +; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8],zero,zero,zero,ymm4[1,9],zero,zero,zero,ymm4[2,10],zero,zero,zero,ymm4[19,27],zero,zero,zero,ymm4[20,28],zero,zero,zero,ymm4[21,29],zero,zero,zero +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,2,0] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[3,19],zero,zero,zero,ymm5[28,20],zero,zero,zero,ymm5[29,21],zero,zero,zero,ymm5[30,22] +; AVX2-FCP-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,0,1,1] +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,5,2,6,2,6,3,7] +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[3,7],zero,zero,zero,ymm1[8,12],zero,zero,zero,ymm1[9,13],zero,zero,zero,ymm1[18,22],zero,zero,zero,ymm1[19,23],zero,zero,zero,ymm1[24,28],zero,zero +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,5,2,6,6,2,3,7] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm2 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm2[2,6],zero,zero,zero,ymm2[3,7],zero,zero,zero,ymm2[8,12],zero,zero,zero,ymm2[9,17],zero,zero,zero,ymm2[22,18],zero,zero,zero,ymm2[23,19],zero,zero,zero,ymm2[24,28] +; AVX2-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,1,2,2,2,2,2,2] +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm4, (%r9) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] +; AVX2-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, 64(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm5, 32(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm7, (%r9) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: store_i8_stride5_vf16: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512-NEXT: vmovdqa (%rcx), %xmm4 -; AVX512-NEXT: vmovdqa (%r8), %xmm1 -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm5 -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm6 -; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,7],zero,ymm6[u,u,u,8],zero,ymm6[u,u,u,9],zero,ymm6[u,u,u],zero,ymm6[26,u,u,u],zero,ymm6[27,u,u,u],zero,ymm6[28,u,u] -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u],zero,ymm8[7,u,u,u],zero,ymm8[8,u,u,u],zero,ymm8[9,u,u,u,26],zero,ymm8[u,u,u,27],zero,ymm8[u,u,u,28],zero,ymm8[u,u] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ~ymm9 & (ymm8 | ymm7) -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,2,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8,u],zero,zero,ymm7[1,9,u],zero,zero,ymm7[2,10,u],zero,zero,ymm7[19,27,u],zero,zero,ymm7[20,28,u],zero,zero,ymm7[21,29,u],zero,zero -; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 -; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[6],zero,ymm5[u,u,u,7],zero,ymm5[u,u,u,8],zero,ymm5[u,u,u,9,25,u,u,u],zero,ymm5[26,u,u,u],zero,ymm5[27,u,u,u],zero,ymm5[28] -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[6,u,u,u],zero,ymm5[7,u,u,u],zero,ymm5[8,u,u,u],zero,zero,ymm5[u,u,u,26],zero,ymm5[u,u,u,27],zero,ymm5[u,u,u,28],zero -; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm9 & (ymm5 | ymm8) -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0] -; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,ymm6[u,1,9],zero,zero,ymm6[u,2,10],zero,zero,ymm6[u,3,19],zero,zero,ymm6[u,28,20],zero,zero,ymm6[u,29,21],zero,zero,ymm6[u,30,22] -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512-NEXT: vporq %zmm7, %zmm5, %zmm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] -; AVX512-NEXT: vpermd %zmm1, %zmm6, %zmm6 -; AVX512-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u],zero,zero,xmm3[10,11,u],zero,zero,xmm3[12,13,u],zero,zero,xmm3[14,15,u] -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,11],zero,zero,xmm0[u,12,13],zero,zero,xmm0[u,14,15],zero,zero,xmm0[u] -; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4],zero,xmm0[6,7,8,9],zero,xmm0[11,12,13,14],zero -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[12],zero,zero,zero,zero,xmm1[13],zero,zero,zero,zero,xmm1[14],zero,zero,zero,zero,xmm1[15] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512-NEXT: vmovdqa (%r8), %xmm0 +; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 +; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,7],zero,ymm1[u,u,u,8],zero,ymm1[u,u,u,9],zero,ymm1[u,u,u],zero,ymm1[26,u,u,u],zero,ymm1[27,u,u,u],zero,ymm1[28,u,u] +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u],zero,ymm4[7,u,u,u],zero,ymm4[8,u,u,u],zero,ymm4[9,u,u,u,26],zero,ymm4[u,u,u,27],zero,ymm4[u,u,u,28],zero,ymm4[u,u] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ~ymm5 & (ymm4 | ymm3) +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,0,2] +; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u],zero,zero,ymm3[1,9,u],zero,zero,ymm3[2,10,u],zero,zero,ymm3[19,27,u],zero,zero,ymm3[20,28,u],zero,zero,ymm3[21,29,u],zero,zero +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[6],zero,ymm2[u,u,u,7],zero,ymm2[u,u,u,8],zero,ymm2[u,u,u,9,25,u,u,u],zero,ymm2[26,u,u,u],zero,ymm2[27,u,u,u],zero,ymm2[28] +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[6,u,u,u],zero,ymm6[7,u,u,u],zero,ymm6[8,u,u,u],zero,zero,ymm6[u,u,u,26],zero,ymm6[u,u,u,27],zero,ymm6[u,u,u,28],zero +; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 & (ymm6 | ymm4) +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,2,0] +; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,1,9],zero,zero,ymm4[u,2,10],zero,zero,ymm4[u,3,19],zero,zero,ymm4[u,28,20],zero,zero,ymm4[u,29,21],zero,zero,ymm4[u,30,22] +; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512-NEXT: vporq %zmm3, %zmm4, %zmm3 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] +; AVX512-NEXT: vpermd %zmm0, %zmm4, %zmm4 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[5,13,u],zero,zero,xmm2[6,14,u],zero,zero,xmm2[7,15,u] +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,5,13],zero,zero,xmm1[u,6,14],zero,zero,xmm1[u,7,15],zero,zero,xmm1[u] +; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, 64(%r9) -; AVX512-NEXT: vmovdqa64 %zmm6, (%r9) +; AVX512-NEXT: vmovdqa64 %zmm4, (%r9) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i8_stride5_vf16: ; 
AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm4 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8,u],zero,zero,ymm7[1,9,u],zero,zero,ymm7[2,10,u],zero,zero,ymm7[19,27,u],zero,zero,ymm7[20,28,u],zero,zero,ymm7[21,29,u],zero,zero -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7] -; AVX512-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm8 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,ymm8[u,3,7],zero,zero,ymm8[u,8,12],zero,zero,ymm8[u,9,13],zero,zero,ymm8[u,18,22],zero,zero,ymm8[u,19,23],zero,zero,ymm8[u,24,28],zero,zero -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,0] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,ymm5[u,1,9],zero,zero,ymm5[u,2,10],zero,zero,ymm5[u,3,19],zero,zero,ymm5[u,28,20],zero,zero,ymm5[u,29,21],zero,zero,ymm5[u,30,22] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,6,2,3,7] -; AVX512-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,6,u],zero,zero,ymm6[3,7,u],zero,zero,ymm6[8,12,u],zero,zero,ymm6[9,17,u],zero,zero,ymm6[22,18,u],zero,zero,ymm6[23,19,u],zero,zero,ymm6[24,28] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 -; AVX512-FCP-NEXT: vporq %zmm7, %zmm5, %zmm5 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0] -; AVX512-FCP-NEXT: vpermd %zmm6, %zmm7, %zmm6 -; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u],zero,zero,xmm3[10,11,u],zero,zero,xmm3[12,13,u],zero,zero,xmm3[14,15,u] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,10,11],zero,zero,xmm1[u,12,13],zero,zero,xmm1[u,14,15],zero,zero,xmm1[u] -; AVX512-FCP-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,0,2] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u],zero,zero,ymm3[1,9,u],zero,zero,ymm3[2,10,u],zero,zero,ymm3[19,27,u],zero,zero,ymm3[20,28,u],zero,zero,ymm3[21,29,u],zero,zero +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,5,2,6,2,6,3,7] +; AVX512-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[u,3,7],zero,zero,ymm4[u,8,12],zero,zero,ymm4[u,9,13],zero,zero,ymm4[u,18,22],zero,zero,ymm4[u,19,23],zero,zero,ymm4[u,24,28],zero,zero +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,2,0] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = 
ymm4[0,8],zero,zero,ymm4[u,1,9],zero,zero,ymm4[u,2,10],zero,zero,ymm4[u,3,19],zero,zero,ymm4[u,28,20],zero,zero,ymm4[u,29,21],zero,zero,ymm4[u,30,22] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,5,2,6,6,2,3,7] +; AVX512-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,6,u],zero,zero,ymm5[3,7,u],zero,zero,ymm5[8,12,u],zero,zero,ymm5[9,17,u],zero,zero,ymm5[22,18,u],zero,zero,ymm5[23,19,u],zero,zero,ymm5[24,28] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vporq %zmm3, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0] +; AVX512-FCP-NEXT: vpermd %zmm4, %zmm5, %zmm4 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[5,13,u],zero,zero,xmm2[6,14,u],zero,zero,xmm2[7,15,u] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,5,13],zero,zero,xmm1[u,6,14],zero,zero,xmm1[u,7,15],zero,zero,xmm1[u] +; AVX512-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] ; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-FCP-NEXT: vmovdqa %xmm0, 64(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%r9) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i8_stride5_vf16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm4 -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm1 -; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm5 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm6 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,7],zero,ymm6[u,u,u,8],zero,ymm6[u,u,u,9],zero,ymm6[u,u,u],zero,ymm6[26,u,u,u],zero,ymm6[27,u,u,u],zero,ymm6[28,u,u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u],zero,ymm8[7,u,u,u],zero,ymm8[8,u,u,u],zero,ymm8[9,u,u,u,26],zero,ymm8[u,u,u,27],zero,ymm8[u,u,u,28],zero,ymm8[u,u] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ~ymm9 & (ymm8 | ymm7) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8,u],zero,zero,ymm7[1,9,u],zero,zero,ymm7[2,10,u],zero,zero,ymm7[19,27,u],zero,zero,ymm7[20,28,u],zero,zero,ymm7[21,29,u],zero,zero -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[6],zero,ymm5[u,u,u,7],zero,ymm5[u,u,u,8],zero,ymm5[u,u,u,9,25,u,u,u],zero,ymm5[26,u,u,u],zero,ymm5[27,u,u,u],zero,ymm5[28] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[6,u,u,u],zero,ymm5[7,u,u,u],zero,ymm5[8,u,u,u],zero,zero,ymm5[u,u,u,26],zero,ymm5[u,u,u,27],zero,ymm5[u,u,u,28],zero -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm9 & (ymm5 | ymm8) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = 
ymm6[0,8],zero,zero,ymm6[u,1,9],zero,zero,ymm6[u,2,10],zero,zero,ymm6[u,3,19],zero,zero,ymm6[u,28,20],zero,zero,ymm6[u,29,21],zero,zero,ymm6[u,30,22] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512DQ-NEXT: vporq %zmm7, %zmm5, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] -; AVX512DQ-NEXT: vpermd %zmm1, %zmm6, %zmm6 -; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u],zero,zero,xmm3[10,11,u],zero,zero,xmm3[12,13,u],zero,zero,xmm3[14,15,u] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,11],zero,zero,xmm0[u,12,13],zero,zero,xmm0[u,14,15],zero,zero,xmm0[u] -; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4],zero,xmm0[6,7,8,9],zero,xmm0[11,12,13,14],zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[12],zero,zero,zero,zero,xmm1[13],zero,zero,zero,zero,xmm1[14],zero,zero,zero,zero,xmm1[15] -; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 +; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,7],zero,ymm1[u,u,u,8],zero,ymm1[u,u,u,9],zero,ymm1[u,u,u],zero,ymm1[26,u,u,u],zero,ymm1[27,u,u,u],zero,ymm1[28,u,u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u],zero,ymm4[7,u,u,u],zero,ymm4[8,u,u,u],zero,ymm4[9,u,u,u,26],zero,ymm4[u,u,u,27],zero,ymm4[u,u,u,28],zero,ymm4[u,u] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ~ymm5 & (ymm4 | ymm3) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,0,2] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u],zero,zero,ymm3[1,9,u],zero,zero,ymm3[2,10,u],zero,zero,ymm3[19,27,u],zero,zero,ymm3[20,28,u],zero,zero,ymm3[21,29,u],zero,zero +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[6],zero,ymm2[u,u,u,7],zero,ymm2[u,u,u,8],zero,ymm2[u,u,u,9,25,u,u,u],zero,ymm2[26,u,u,u],zero,ymm2[27,u,u,u],zero,ymm2[28] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[6,u,u,u],zero,ymm6[7,u,u,u],zero,ymm6[8,u,u,u],zero,zero,ymm6[u,u,u,26],zero,ymm6[u,u,u,27],zero,ymm6[u,u,u,28],zero +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 & (ymm6 | ymm4) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,2,0] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,1,9],zero,zero,ymm4[u,2,10],zero,zero,ymm4[u,3,19],zero,zero,ymm4[u,28,20],zero,zero,ymm4[u,29,21],zero,zero,ymm4[u,30,22] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512DQ-NEXT: vporq %zmm3, %zmm4, %zmm3 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm4 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) +; AVX512DQ-NEXT: 
vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[5,13,u],zero,zero,xmm2[6,14,u],zero,zero,xmm2[7,15,u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,5,13],zero,zero,xmm1[u,6,14],zero,zero,xmm1[u,7,15],zero,zero,xmm1[u] +; AVX512DQ-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] +; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512DQ-NEXT: vmovdqa %xmm0, 64(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%r9) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i8_stride5_vf16: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8,u],zero,zero,ymm7[1,9,u],zero,zero,ymm7[2,10,u],zero,zero,ymm7[19,27,u],zero,zero,ymm7[20,28,u],zero,zero,ymm7[21,29,u],zero,zero -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm8 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,ymm8[u,3,7],zero,zero,ymm8[u,8,12],zero,zero,ymm8[u,9,13],zero,zero,ymm8[u,18,22],zero,zero,ymm8[u,19,23],zero,zero,ymm8[u,24,28],zero,zero -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,0] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,ymm5[u,1,9],zero,zero,ymm5[u,2,10],zero,zero,ymm5[u,3,19],zero,zero,ymm5[u,28,20],zero,zero,ymm5[u,29,21],zero,zero,ymm5[u,30,22] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,6,2,3,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,6,u],zero,zero,ymm6[3,7,u],zero,zero,ymm6[8,12,u],zero,zero,ymm6[9,17,u],zero,zero,ymm6[22,18,u],zero,zero,ymm6[23,19,u],zero,zero,ymm6[24,28] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 -; AVX512DQ-FCP-NEXT: vporq %zmm7, %zmm5, %zmm5 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm7, %zmm6 -; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u],zero,zero,xmm3[10,11,u],zero,zero,xmm3[12,13,u],zero,zero,xmm3[14,15,u] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,10,11],zero,zero,xmm1[u,12,13],zero,zero,xmm1[u,14,15],zero,zero,xmm1[u] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: 
vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,0,2] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u],zero,zero,ymm3[1,9,u],zero,zero,ymm3[2,10,u],zero,zero,ymm3[19,27,u],zero,zero,ymm3[20,28,u],zero,zero,ymm3[21,29,u],zero,zero +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,5,2,6,2,6,3,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[u,3,7],zero,zero,ymm4[u,8,12],zero,zero,ymm4[u,9,13],zero,zero,ymm4[u,18,22],zero,zero,ymm4[u,19,23],zero,zero,ymm4[u,24,28],zero,zero +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,2,0] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,1,9],zero,zero,ymm4[u,2,10],zero,zero,ymm4[u,3,19],zero,zero,ymm4[u,28,20],zero,zero,ymm4[u,29,21],zero,zero,ymm4[u,30,22] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,5,2,6,6,2,3,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,6,u],zero,zero,ymm5[3,7,u],zero,zero,ymm5[8,12,u],zero,zero,ymm5[9,17,u],zero,zero,ymm5[22,18,u],zero,zero,ymm5[23,19,u],zero,zero,ymm5[24,28] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vporq %zmm3, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0] +; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm5, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[5,13,u],zero,zero,xmm2[6,14,u],zero,zero,xmm2[7,15,u] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,5,13],zero,zero,xmm1[u,6,14],zero,zero,xmm1[u,7,15],zero,zero,xmm1[u] +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] ; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 64(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%r9) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: store_i8_stride5_vf16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512BW-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm4 -; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5 -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,ymm6[7],zero,zero,zero,zero,ymm6[8],zero,zero,zero,zero,ymm6[9],zero,zero,zero,zero,zero,ymm6[26],zero,zero,zero,zero,ymm6[27],zero,zero,zero,zero,ymm6[28],zero,zero -; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm8[7],zero,zero,zero,zero,ymm8[8],zero,zero,zero,zero,ymm8[9],zero,zero,zero,ymm8[26],zero,zero,zero,zero,ymm8[27],zero,zero,zero,zero,ymm8[28],zero,zero,zero -; AVX512BW-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9,25],zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm9 = ymm5[2,3,0,1] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[6],zero,zero,zero,zero,ymm9[7],zero,zero,zero,zero,ymm9[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[26],zero,zero,zero,zero,ymm9[27],zero,zero,zero,zero,ymm9[28],zero -; AVX512BW-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm0[7],zero,zero,zero,zero,ymm0[8],zero,zero,zero,zero,ymm0[9],zero,zero,zero,zero,zero,ymm0[26],zero,zero,zero,zero,ymm0[27],zero,zero,zero,zero,ymm0[28],zero,zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,ymm4[9],zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero,zero,zero +; AVX512BW-NEXT: vpor %ymm4, %ymm3, %ymm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[6],zero,zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9,25],zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero +; AVX512BW-NEXT: vpor %ymm4, %ymm5, %ymm4 ; AVX512BW-NEXT: movl $831283992, %eax # imm = 0x318C6318 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm8 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,zero,ymm6[1,9],zero,zero,zero,ymm6[2,10],zero,zero,zero,ymm6[3,19],zero,zero,zero,ymm6[28,20],zero,zero,zero,ymm6[29,21],zero,zero,zero,ymm6[30,22] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,0,2] +; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,2,2,0] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,8],zero,zero,zero,ymm3[1,9],zero,zero,zero,ymm3[2,10],zero,zero,zero,ymm3[3,19],zero,zero,zero,ymm3[28,20],zero,zero,zero,ymm3[29,21],zero,zero,zero,ymm3[30,22] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[19,27],zero,zero,zero,ymm5[20,28],zero,zero,zero,ymm5[21,29],zero,zero,zero -; AVX512BW-NEXT: vpor %ymm6, %ymm5, %ymm5 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] -; AVX512BW-NEXT: vpermd %zmm4, %zmm6, %zmm6 +; AVX512BW-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] +; AVX512BW-NEXT: vpermd %zmm2, %zmm4, %zmm4 ; AVX512BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512BW-NEXT: kmovq %rax, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm5 {%k1} -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[10,11],zero,zero,zero,xmm2[12,13],zero,zero,zero,xmm2[14,15],zero -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12],zero,zero,zero,zero,xmm4[13],zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,xmm4[15] -; AVX512BW-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 | xmm0 | xmm2 -; AVX512BW-NEXT: vmovdqa %xmm1, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,13],zero,zero,zero,xmm1[6,14],zero,zero,zero,xmm1[7,15],zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5,13],zero,zero,zero,xmm0[6,14],zero,zero,zero,xmm0[7,15],zero,zero,zero +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15] +; AVX512BW-NEXT: vpternlogq {{.*#+}} xmm2 = xmm2 | xmm0 | xmm1 +; AVX512BW-NEXT: vmovdqa %xmm2, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: store_i8_stride5_vf16: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm4 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,5,2,6,6,2,3,7] -; AVX512BW-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm7 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm5[0,2,2,0] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[0,8],zero,zero,zero,zmm7[1,9],zero,zero,zero,zmm7[2,10],zero,zero,zero,zmm7[3,19],zero,zero,zero,zmm7[28,20],zero,zero,zero,zmm7[29,21],zero,zero,zero,zmm7[30,22,34,38],zero,zero,zero,zmm7[35,39],zero,zero,zero,zmm7[40,44],zero,zero,zero,zmm7[41,49],zero,zero,zero,zmm7[54,50],zero,zero,zero,zmm7[55,51],zero,zero,zero,zmm7[56,60] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7] -; AVX512BW-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm5 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zero,zero,zmm5[0,8],zero,zero,zero,zmm5[1,9],zero,zero,zero,zmm5[2,10],zero,zero,zero,zmm5[19,27],zero,zero,zero,zmm5[20,28],zero,zero,zero,zmm5[21,29],zero,zero,zero,zero,zero,zero,zmm5[35,39],zero,zero,zero,zmm5[40,44],zero,zero,zero,zmm5[41,45],zero,zero,zero,zmm5[50,54],zero,zero,zero,zmm5[51,55],zero,zero,zero,zmm5[56,60],zero,zero -; AVX512BW-FCP-NEXT: vporq %zmm7, %zmm5, %zmm5 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] -; AVX512BW-FCP-NEXT: vpermd %zmm6, %zmm7, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512BW-FCP-NEXT: 
vmovdqa (%r8), %xmm2 +; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,2,6,6,2,3,7] +; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,2,0] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zmm3[1,9],zero,zero,zero,zmm3[2,10],zero,zero,zero,zmm3[3,19],zero,zero,zero,zmm3[28,20],zero,zero,zero,zmm3[29,21],zero,zero,zero,zmm3[30,22,34,38],zero,zero,zero,zmm3[35,39],zero,zero,zero,zmm3[40,44],zero,zero,zero,zmm3[41,49],zero,zero,zero,zmm3[54,50],zero,zero,zero,zmm3[55,51],zero,zero,zero,zmm3[56,60] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,5,2,6,2,6,3,7] +; AVX512BW-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm4 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zero,zero,zmm4[0,8],zero,zero,zero,zmm4[1,9],zero,zero,zero,zmm4[2,10],zero,zero,zero,zmm4[19,27],zero,zero,zero,zmm4[20,28],zero,zero,zero,zmm4[21,29],zero,zero,zero,zero,zero,zero,zmm4[35,39],zero,zero,zero,zmm4[40,44],zero,zero,zero,zmm4[41,45],zero,zero,zero,zmm4[50,54],zero,zero,zero,zmm4[51,55],zero,zero,zero,zmm4[56,60],zero,zero +; AVX512BW-FCP-NEXT: vporq %zmm3, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] +; AVX512BW-FCP-NEXT: vpermd %zmm4, %zmm5, %zmm4 ; AVX512BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512BW-FCP-NEXT: kmovq %rax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[10,11],zero,zero,zero,xmm2[12,13],zero,zero,zero,xmm2[14,15],zero -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12],zero,zero,zero,zero,xmm4[13],zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,xmm4[15] -; AVX512BW-FCP-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 | xmm0 | xmm2 -; AVX512BW-FCP-NEXT: vmovdqa %xmm1, 64(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,13],zero,zero,zero,xmm1[6,14],zero,zero,zero,xmm1[7,15],zero +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5,13],zero,zero,zero,xmm0[6,14],zero,zero,zero,xmm0[7,15],zero,zero,zero +; AVX512BW-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 | xmm2 | xmm1 +; AVX512BW-FCP-NEXT: vmovdqa %xmm0, 64(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: 
store_i8_stride5_vf16: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm4 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,ymm6[7],zero,zero,zero,zero,ymm6[8],zero,zero,zero,zero,ymm6[9],zero,zero,zero,zero,zero,ymm6[26],zero,zero,zero,zero,ymm6[27],zero,zero,zero,zero,ymm6[28],zero,zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm8[7],zero,zero,zero,zero,ymm8[8],zero,zero,zero,zero,ymm8[9],zero,zero,zero,ymm8[26],zero,zero,zero,zero,ymm8[27],zero,zero,zero,zero,ymm8[28],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9,25],zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm9 = ymm5[2,3,0,1] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[6],zero,zero,zero,zero,ymm9[7],zero,zero,zero,zero,ymm9[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[26],zero,zero,zero,zero,ymm9[27],zero,zero,zero,zero,ymm9[28],zero -; AVX512DQ-BW-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 +; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm0[7],zero,zero,zero,zero,ymm0[8],zero,zero,zero,zero,ymm0[9],zero,zero,zero,zero,zero,ymm0[26],zero,zero,zero,zero,ymm0[27],zero,zero,zero,zero,ymm0[28],zero,zero +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,ymm4[9],zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm3, %ymm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[6],zero,zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9,25],zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero +; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm5, %ymm4 ; AVX512DQ-BW-NEXT: movl $831283992, %eax # imm = 0x318C6318 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm7, %ymm8 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,zero,ymm6[1,9],zero,zero,zero,ymm6[2,10],zero,zero,zero,ymm6[3,19],zero,zero,zero,ymm6[28,20],zero,zero,zero,ymm6[29,21],zero,zero,zero,ymm6[30,22] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,0,2] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,2,2,0] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm3[0,8],zero,zero,zero,ymm3[1,9],zero,zero,zero,ymm3[2,10],zero,zero,zero,ymm3[3,19],zero,zero,zero,ymm3[28,20],zero,zero,zero,ymm3[29,21],zero,zero,zero,ymm3[30,22] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[19,27],zero,zero,zero,ymm5[20,28],zero,zero,zero,ymm5[21,29],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %ymm6, %ymm5, %ymm5 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] -; AVX512DQ-BW-NEXT: vpermd %zmm4, %zmm6, %zmm6 +; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] +; AVX512DQ-BW-NEXT: vpermd %zmm2, %zmm4, %zmm4 ; AVX512DQ-BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512DQ-BW-NEXT: kmovq %rax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[10,11],zero,zero,zero,xmm2[12,13],zero,zero,zero,xmm2[14,15],zero -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12],zero,zero,zero,zero,xmm4[13],zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,xmm4[15] -; AVX512DQ-BW-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 | xmm0 | xmm2 -; AVX512DQ-BW-NEXT: vmovdqa %xmm1, 64(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,13],zero,zero,zero,xmm1[6,14],zero,zero,zero,xmm1[7,15],zero +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5,13],zero,zero,zero,xmm0[6,14],zero,zero,zero,xmm0[7,15],zero,zero,zero +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15] +; AVX512DQ-BW-NEXT: vpternlogq {{.*#+}} xmm2 = xmm2 | xmm0 | xmm1 +; AVX512DQ-BW-NEXT: vmovdqa %xmm2, 64(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: store_i8_stride5_vf16: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm4 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,5,2,6,6,2,3,7] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm5[0,2,2,0] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = 
zmm7[0,8],zero,zero,zero,zmm7[1,9],zero,zero,zero,zmm7[2,10],zero,zero,zero,zmm7[3,19],zero,zero,zero,zmm7[28,20],zero,zero,zero,zmm7[29,21],zero,zero,zero,zmm7[30,22,34,38],zero,zero,zero,zmm7[35,39],zero,zero,zero,zmm7[40,44],zero,zero,zero,zmm7[41,49],zero,zero,zero,zmm7[54,50],zero,zero,zero,zmm7[55,51],zero,zero,zero,zmm7[56,60] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zero,zero,zmm5[0,8],zero,zero,zero,zmm5[1,9],zero,zero,zero,zmm5[2,10],zero,zero,zero,zmm5[19,27],zero,zero,zero,zmm5[20,28],zero,zero,zero,zmm5[21,29],zero,zero,zero,zero,zero,zero,zmm5[35,39],zero,zero,zero,zmm5[40,44],zero,zero,zero,zmm5[41,45],zero,zero,zero,zmm5[50,54],zero,zero,zero,zmm5[51,55],zero,zero,zero,zmm5[56,60],zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %zmm7, %zmm5, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] -; AVX512DQ-BW-FCP-NEXT: vpermd %zmm6, %zmm7, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,2,6,6,2,3,7] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,2,0] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zmm3[1,9],zero,zero,zero,zmm3[2,10],zero,zero,zero,zmm3[3,19],zero,zero,zero,zmm3[28,20],zero,zero,zero,zmm3[29,21],zero,zero,zero,zmm3[30,22,34,38],zero,zero,zero,zmm3[35,39],zero,zero,zero,zmm3[40,44],zero,zero,zero,zmm3[41,49],zero,zero,zero,zmm3[54,50],zero,zero,zero,zmm3[55,51],zero,zero,zero,zmm3[56,60] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,5,2,6,2,6,3,7] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zero,zero,zmm4[0,8],zero,zero,zero,zmm4[1,9],zero,zero,zero,zmm4[2,10],zero,zero,zero,zmm4[19,27],zero,zero,zero,zmm4[20,28],zero,zero,zero,zmm4[21,29],zero,zero,zero,zero,zero,zero,zmm4[35,39],zero,zero,zero,zmm4[40,44],zero,zero,zero,zmm4[41,45],zero,zero,zero,zmm4[50,54],zero,zero,zero,zmm4[51,55],zero,zero,zero,zmm4[56,60],zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %zmm3, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] +; AVX512DQ-BW-FCP-NEXT: vpermd %zmm4, %zmm5, %zmm4 ; AVX512DQ-BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[10,11],zero,zero,zero,xmm2[12,13],zero,zero,zero,xmm2[14,15],zero -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12],zero,zero,zero,zero,xmm4[13],zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,xmm4[15] -; AVX512DQ-BW-FCP-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 | xmm0 | xmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, 64(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,13],zero,zero,zero,xmm1[6,14],zero,zero,zero,xmm1[7,15],zero +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5,13],zero,zero,zero,xmm0[6,14],zero,zero,zero,xmm0[7,15],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 | xmm2 | xmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, 64(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index 98a64ee987f7b..ab968b91153a9 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -1813,81 +1813,79 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2: # %bb.0: ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-NEXT: vmovdqa (%rdx), %xmm5 -; AVX2-NEXT: vmovdqa (%rcx), %xmm6 -; AVX2-NEXT: vmovdqa (%r8), %xmm3 -; AVX2-NEXT: vmovdqa (%r9), %xmm4 -; AVX2-NEXT: vmovdqa (%r10), %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 -; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm8 -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm10 -; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm7[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,ymm11[5],zero,zero,zero,zero,zero,zero,ymm11[6],zero,zero,zero,zero,zero,ymm11[23],zero,zero,zero,zero,zero,zero,ymm11[24],zero,zero,zero,zero,zero,zero,ymm11[25] -; AVX2-NEXT: vpor %ymm9, %ymm11, %ymm9 -; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero,zero -; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm8[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,ymm12[5],zero,zero,zero,zero,zero,zero,ymm12[6],zero,zero,zero,zero,zero,ymm12[23],zero,zero,zero,zero,zero,zero,ymm12[24],zero,zero,zero,zero,zero -; AVX2-NEXT: vpor %ymm12, %ymm11, %ymm11 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255] -; 
AVX2-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 -; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[4],zero,zero,zero,zero,zero,zero,ymm10[5],zero,zero,zero,zero,zero,zero,ymm10[6],zero,zero,zero,zero,zero,zero,zero,ymm10[23],zero,zero,zero,zero,zero,zero,ymm10[24],zero,zero -; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm12 = zero,ymm12[4],zero,zero,zero,zero,zero,zero,ymm12[5],zero,zero,zero,zero,zero,zero,ymm12[6],zero,zero,zero,zero,zero,ymm12[23],zero,zero,zero,zero,zero,zero,ymm12[24],zero,zero,zero -; AVX2-NEXT: vpor %ymm12, %ymm11, %ymm11 -; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u] -; AVX2-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX2-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 -; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm10[0,2,0,2] -; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[1,1,0,0,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,0] -; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u] -; AVX2-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm8[0,2,0,2] -; AVX2-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,ymm12[0,8],zero,zero,zero,zero,zero,ymm12[1,9],zero,zero,zero,zero,zero,ymm12[18,26],zero,zero,zero,zero,zero,ymm12[19,27],zero,zero,zero,zero,zero,ymm12[20,28] -; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm7[0,2,0,2] -; AVX2-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,8],zero,zero,zero,zero,zero,ymm13[1,9],zero,zero,zero,zero,zero,ymm13[2,10],zero,zero,zero,zero,zero,ymm13[19,27],zero,zero,zero,zero,zero,ymm13[20,28],zero,zero -; AVX2-NEXT: vpor %ymm12, %ymm13, %ymm12 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm13, %ymm12, %ymm11, %ymm11 -; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,3,1,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm2[0,1,2,3,4,5,5,6] -; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] -; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255] -; AVX2-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm10 -; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,1,1,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero -; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,3,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[2,10],zero,zero,zero,zero,zero,ymm8[3,19],zero,zero,zero,zero,zero,ymm8[28,20],zero,zero,zero,zero,zero,ymm8[29,21],zero +; AVX2-NEXT: vmovdqa (%rdi), %xmm3 +; AVX2-NEXT: vmovdqa (%rdx), %xmm4 +; AVX2-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-NEXT: vmovdqa (%r9), %xmm2 +; AVX2-NEXT: 
vmovdqa (%r10), %xmm0 +; AVX2-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3 +; AVX2-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm6 +; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm3[5],zero,zero,zero,zero,zero,zero,ymm3[6],zero,zero,zero,zero,zero,zero,zero,ymm3[23],zero,zero,zero,zero,zero,zero,ymm3[24],zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero,ymm7[25] +; AVX2-NEXT: vpor %ymm5, %ymm7, %ymm5 +; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,ymm4[5],zero,zero,zero,zero,zero,zero,ymm4[6],zero,zero,zero,zero,zero,zero,zero,ymm4[23],zero,zero,zero,zero,zero,zero,ymm4[24],zero,zero,zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm4[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero,zero,zero ; AVX2-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX2-NEXT: vpblendvb %ymm8, %ymm7, %ymm10, %ymm7 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm5[12,13],zero,zero,zero,zero,zero,xmm5[14,15],zero,zero,zero -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[12,13],zero,zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero,zero -; AVX2-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255] +; AVX2-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 +; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[4],zero,zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero +; AVX2-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u] +; AVX2-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX2-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 +; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] +; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = 
ymm7[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[1,1,0,0,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u] +; AVX2-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 +; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm4[0,2,0,2] +; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,ymm8[0,8],zero,zero,zero,zero,zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[18,26],zero,zero,zero,zero,zero,ymm8[19,27],zero,zero,zero,zero,zero,ymm8[20,28] +; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm3[0,2,0,2] +; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,8],zero,zero,zero,zero,zero,ymm9[1,9],zero,zero,zero,zero,zero,ymm9[2,10],zero,zero,zero,zero,zero,ymm9[19,27],zero,zero,zero,zero,zero,ymm9[20,28],zero,zero +; AVX2-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm9, %ymm8, %ymm7, %ymm7 +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3] +; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6] +; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255] +; AVX2-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 +; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm3[3,1,1,3] +; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,zero,zero,zero,ymm8[10,2],zero,zero,zero,zero,zero,ymm8[11,3],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero,zero,zero,zero,ymm8[21,29],zero,zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm4[1,3,3,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9],zero,zero,zero,zero,zero,ymm9[2,10],zero,zero,zero,zero,zero,ymm9[3,19],zero,zero,zero,zero,zero,ymm9[28,20],zero,zero,zero,zero,zero,ymm9[29,21],zero +; AVX2-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX2-NEXT: vpblendvb %ymm9, %ymm8, %ymm6, %ymm6 +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero,zero,zero +; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,7,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0] -; AVX2-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] -; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,7,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] +; AVX2-NEXT: 
vmovdqa {{.*#+}} xmm2 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0] +; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %ymm6, 64(%rax) +; AVX2-NEXT: vmovdqa %ymm7, (%rax) +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] +; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX2-NEXT: vmovdqa %ymm7, 64(%rax) -; AVX2-NEXT: vmovdqa %ymm11, (%rax) -; AVX2-NEXT: vmovdqa %ymm9, 32(%rax) +; AVX2-NEXT: vmovdqa %ymm5, 32(%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1895,77 +1893,75 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm4 -; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm5 -; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FP-NEXT: vmovdqa (%r9), %xmm3 +; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm3 +; AVX2-FP-NEXT: vmovdqa (%r8), %xmm0 +; AVX2-FP-NEXT: vmovdqa (%r9), %xmm1 +; AVX2-FP-NEXT: vinserti128 $1, (%rsi), %ymm2, %ymm2 +; AVX2-FP-NEXT: vinserti128 $1, (%rcx), %ymm3, %ymm3 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 -; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm7 -; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm10 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero,zero,zero -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm6[2,3,0,1] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[5],zero,zero,zero,zero,zero,zero,ymm9[6],zero,zero,zero,zero,zero,ymm9[23],zero,zero,zero,zero,zero,zero,ymm9[24],zero,zero,zero,zero,zero,zero,ymm9[25] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,ymm2[5],zero,zero,zero,zero,zero,zero,ymm2[6],zero,zero,zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,ymm2[24],zero,zero,zero,zero,zero,zero +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[5],zero,zero,zero,zero,zero,zero,ymm5[6],zero,zero,zero,zero,zero,ymm5[23],zero,zero,zero,zero,zero,zero,ymm5[24],zero,zero,zero,zero,zero,zero,ymm5[25] +; AVX2-FP-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm3[5],zero,zero,zero,zero,zero,zero,ymm3[6],zero,zero,zero,zero,zero,zero,zero,ymm3[23],zero,zero,zero,zero,zero,zero,ymm3[24],zero,zero,zero,zero +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero +; AVX2-FP-NEXT: vpor %ymm7, %ymm5, %ymm5 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255] +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm5 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = 
zero,ymm7[4],zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero +; AVX2-FP-NEXT: vpor %ymm7, %ymm4, %ymm7 +; AVX2-FP-NEXT: vmovdqa (%r10), %xmm4 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u] +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm6[0,2,0,2] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm8, %ymm7, %ymm7 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,2,0,2] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,ymm8[0,8],zero,zero,zero,zero,zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[18,26],zero,zero,zero,zero,zero,ymm8[19,27],zero,zero,zero,zero,zero,ymm8[20,28] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,2,0,2] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,8],zero,zero,zero,zero,zero,ymm9[1,9],zero,zero,zero,zero,zero,ymm9[2,10],zero,zero,zero,zero,zero,ymm9[19,27],zero,zero,zero,zero,zero,ymm9[20,28],zero,zero ; AVX2-FP-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm7[2,3,0,1] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,ymm11[5],zero,zero,zero,zero,zero,zero,ymm11[6],zero,zero,zero,zero,zero,ymm11[23],zero,zero,zero,zero,zero,zero,ymm11[24],zero,zero,zero,zero,zero -; AVX2-FP-NEXT: vpor %ymm11, %ymm9, %ymm9 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255] -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm9 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[4],zero,zero,zero,zero,zero,zero,ymm10[5],zero,zero,zero,zero,zero,zero,ymm10[6],zero,zero,zero,zero,zero,zero,zero,ymm10[23],zero,zero,zero,zero,zero,zero,ymm10[24],zero,zero -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = zero,ymm11[4],zero,zero,zero,zero,zero,zero,ymm11[5],zero,zero,zero,zero,zero,zero,ymm11[6],zero,zero,zero,zero,zero,ymm11[23],zero,zero,zero,zero,zero,zero,ymm11[24],zero,zero,zero -; AVX2-FP-NEXT: vpor %ymm11, %ymm8, %ymm11 -; AVX2-FP-NEXT: vmovdqa (%r10), %xmm8 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u] -; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = 
[0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,0] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm10[0,2,0,2] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u] -; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm12, %ymm11, %ymm11 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm7[0,2,0,2] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,ymm12[0,8],zero,zero,zero,zero,zero,ymm12[1,9],zero,zero,zero,zero,zero,ymm12[18,26],zero,zero,zero,zero,zero,ymm12[19,27],zero,zero,zero,zero,zero,ymm12[20,28] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm6[0,2,0,2] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,8],zero,zero,zero,zero,zero,ymm13[1,9],zero,zero,zero,zero,zero,ymm13[2,10],zero,zero,zero,zero,zero,ymm13[19,27],zero,zero,zero,zero,zero,ymm13[20,28],zero,zero -; AVX2-FP-NEXT: vpor %ymm12, %ymm13, %ymm12 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm12, %ymm11, %ymm11 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,3,1,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255] -; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm10 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[3,1,1,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,zero,zero,zero,ymm6[10,2],zero,zero,zero,zero,zero,ymm6[11,3],zero,zero,zero,zero,zero,ymm6[20,28],zero,zero,zero,zero,zero,ymm6[21,29],zero,zero,zero -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,3,1] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[2,10],zero,zero,zero,zero,zero,ymm7[3,19],zero,zero,zero,zero,zero,ymm7[28,20],zero,zero,zero,zero,zero,ymm7[29,21],zero -; AVX2-FP-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm6, %ymm10, %ymm6 -; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm4[12,13],zero,zero,zero,zero,zero,xmm4[14,15],zero,zero,zero -; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[12,13],zero,zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero,zero -; AVX2-FP-NEXT: vpor %xmm4, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm8[13,14,15,4,5],zero,zero,xmm8[14,15,14,15,12],zero,zero,xmm8[15] -; AVX2-FP-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] -; AVX2-FP-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-FP-NEXT: vmovdqa %xmm0, 96(%rax) +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm8, %ymm7, %ymm7 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255] +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm2[3,1,1,3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,zero,zero,zero,ymm8[10,2],zero,zero,zero,zero,zero,ymm8[11,3],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero,zero,zero,zero,ymm8[21,29],zero,zero,zero +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm3[1,3,3,1] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9],zero,zero,zero,zero,zero,ymm9[2,10],zero,zero,zero,zero,zero,ymm9[3,19],zero,zero,zero,zero,zero,ymm9[28,20],zero,zero,zero,zero,zero,ymm9[29,21],zero +; AVX2-FP-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm8, %ymm6, %ymm6 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[6,14],zero,zero,zero,zero,zero,xmm2[7,15],zero,zero,zero,zero,zero +; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10],zero,zero,zero,zero,zero,xmm0[13,12],zero,zero,zero,zero,zero,xmm0[15,14],zero +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm4[13,14,15,4,5],zero,zero,xmm4[14,15,14,15,12],zero,zero,xmm4[15] +; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-FP-NEXT: vmovdqa %ymm6, 64(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm11, (%rax) -; AVX2-FP-NEXT: vmovdqa %ymm9, 32(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm7, (%rax) +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] +; AVX2-FP-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX2-FP-NEXT: vmovdqa %xmm0, 96(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm5, 32(%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; @@ -1973,75 +1969,73 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FCP-NEXT: 
vmovdqa (%rdx), %xmm5 -; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm6 -; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm3 -; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm4 -; AVX2-FCP-NEXT: vmovdqa (%r10), %xmm2 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm9 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm10 -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[1,1,0,0,4,5,6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,1,0,1,2,0,0,1] -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm7 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[0,2,0,2] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u] -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm11, %ymm7, %ymm7 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm9[0,2,0,2] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,ymm11[0,8],zero,zero,zero,zero,zero,ymm11[1,9],zero,zero,zero,zero,zero,ymm11[18,26],zero,zero,zero,zero,zero,ymm11[19,27],zero,zero,zero,zero,zero,ymm11[20,28] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm8[0,2,0,2] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,8],zero,zero,zero,zero,zero,ymm12[1,9],zero,zero,zero,zero,zero,ymm12[2,10],zero,zero,zero,zero,zero,ymm12[19,27],zero,zero,zero,zero,zero,ymm12[20,28],zero,zero -; AVX2-FCP-NEXT: vpor %ymm11, %ymm12, %ymm11 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm11, %ymm7, %ymm7 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,0] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [1,5,2,6,1,5,2,6] -; AVX2-FCP-NEXT: # ymm12 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermd %ymm10, %ymm12, %ymm13 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,4,u,u,u,u,u,1,5,u,u,u,u,u,2,6,u,u,u,u,u,19,23,u,u,u,u,u,24,28,u,u] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u] -; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm13, %ymm11, %ymm11 -; AVX2-FCP-NEXT: vpermd %ymm9, %ymm12, %ymm13 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,ymm13[1,5],zero,zero,zero,zero,zero,ymm13[2,6],zero,zero,zero,zero,zero,ymm13[19,23],zero,zero,zero,zero,zero,ymm13[24,28],zero,zero,zero,zero -; AVX2-FCP-NEXT: vpermd %ymm8, %ymm12, %ymm12 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,ymm12[1,5],zero,zero,zero,zero,zero,ymm12[2,6],zero,zero,zero,zero,zero,ymm12[19,23],zero,zero,zero,zero,zero,ymm12[24,28],zero,zero,zero,zero,zero,ymm12[25] -; AVX2-FCP-NEXT: vpor %ymm13, %ymm12, %ymm12 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm12, %ymm11, %ymm11 -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm12 = xmm2[0,1,2,3,4,5,5,6] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [2,2,3,3,2,2,3,3] -; AVX2-FCP-NEXT: # ymm13 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,3,1,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = 
[u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255] -; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm10 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,1,1,3] +; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm3 +; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm4 +; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm2 +; AVX2-FCP-NEXT: vmovdqa (%r10), %xmm0 +; AVX2-FCP-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3 +; AVX2-FCP-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm6 +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[1,1,0,0,4,5,6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,0,1,2,0,0,1] +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm7, %ymm5 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,2,0,2] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8],zero,zero,zero,zero,zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[18,26],zero,zero,zero,zero,zero,ymm7[19,27],zero,zero,zero,zero,zero,ymm7[20,28] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,2,0,2] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,8],zero,zero,zero,zero,zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[2,10],zero,zero,zero,zero,zero,ymm8[19,27],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero +; AVX2-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0] +; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,5,2,6,1,5,2,6] +; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm9 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,4,u,u,u,u,u,1,5,u,u,u,u,u,2,6,u,u,u,u,u,19,23,u,u,u,u,u,24,28,u,u] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u] +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm9, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm9 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,ymm9[1,5],zero,zero,zero,zero,zero,ymm9[2,6],zero,zero,zero,zero,zero,ymm9[19,23],zero,zero,zero,zero,zero,ymm9[24,28],zero,zero,zero,zero +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm8, %ymm8 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm8[1,5],zero,zero,zero,zero,zero,ymm8[2,6],zero,zero,zero,zero,zero,ymm8[19,23],zero,zero,zero,zero,zero,ymm8[24,28],zero,zero,zero,zero,zero,ymm8[25] +; AVX2-FCP-NEXT: vpor %ymm9, %ymm8, %ymm8 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm8, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6] +; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3] +; AVX2-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = 
ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255] +; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm3[3,1,1,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,zero,zero,zero,ymm8[10,2],zero,zero,zero,zero,zero,ymm8[11,3],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero,zero,zero,zero,ymm8[21,29],zero,zero,zero -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm4[1,3,3,1] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9],zero,zero,zero,zero,zero,ymm9[2,10],zero,zero,zero,zero,zero,ymm9[3,19],zero,zero,zero,zero,zero,ymm9[28,20],zero,zero,zero,zero,zero,ymm9[29,21],zero ; AVX2-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm8, %ymm10, %ymm8 -; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm5[12,13],zero,zero,zero,zero,zero,xmm5[14,15],zero,zero,zero -; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[12,13],zero,zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero,zero -; AVX2-FCP-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm8, %ymm6, %ymm6 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero,zero,zero +; AVX2-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,14,15,4,5],zero,zero,xmm2[14,15,14,15,12],zero,zero,xmm2[15] -; AVX2-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] -; AVX2-FCP-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,14,15,4,5],zero,zero,xmm0[14,15,14,15,12],zero,zero,xmm0[15] +; AVX2-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-FCP-NEXT: vmovdqa %ymm6, 64(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm7, 32(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm5, (%rax) +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] +; AVX2-FCP-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm8, 64(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm11, 
32(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm7, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -2049,76 +2043,74 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512-NEXT: vmovdqa (%rcx), %xmm6 -; AVX512-NEXT: vmovdqa (%r8), %xmm3 -; AVX512-NEXT: vmovdqa (%r9), %xmm4 -; AVX512-NEXT: vmovdqa (%r10), %xmm2 -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm9 -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm7 -; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm9[u,u,u,u,u,5],zero,ymm9[u,u,u,u,u,6],zero,ymm9[u,u,u,u,u],zero,ymm9[23,u,u,u,u,u],zero,ymm9[24,u,u,u,u] -; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u],zero,ymm11[5,u,u,u,u,u],zero,ymm11[6,u,u,u,u,u,23],zero,ymm11[u,u,u,u,u,24],zero,ymm11[u,u,u,u] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ~ymm12 & (ymm11 | ymm10) -; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm10[0,8,u,u,u],zero,zero,ymm10[1,9,u,u,u],zero,zero,ymm10[18,26,u,u,u],zero,zero,ymm10[19,27,u,u,u],zero,zero,ymm10[20,28] -; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 -; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm8[u,u,u,5],zero,ymm8[u,u,u,u,u,6],zero,ymm8[u,u,u,u,u],zero,ymm8[23,u,u,u,u,u],zero,ymm8[24,u,u,u,u,u],zero -; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u],zero,ymm13[5,u,u,u,u,u],zero,ymm13[6,u,u,u,u,u,23],zero,ymm13[u,u,u,u,u,24],zero,ymm13[u,u,u,u,u,25] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm12 & (ymm13 | ymm11) -; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm8[0,2,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,8],zero,zero,ymm11[u,u,u,1,9],zero,zero,ymm11[u,u,u,2,10],zero,zero,ymm11[u,u,u,19,27],zero,zero,ymm11[u,u,u,20,28],zero,zero -; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm11 -; AVX512-NEXT: vporq %zmm10, %zmm11, %zmm10 -; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[4],zero,ymm7[u,u,u,u,u,5],zero,ymm7[u,u,u,u,u,6],zero,ymm7[u,u,u,u,u],zero,ymm7[23,u,u,u,u,u],zero,ymm7[24,u,u] -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm7[2,3,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = zero,ymm12[4,u,u,u,u,u],zero,ymm12[5,u,u,u,u,u],zero,ymm12[6,u,u,u,u,u,23],zero,ymm12[u,u,u,u,u,24],zero,ymm12[u,u] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm13 & (ymm12 | ymm11) -; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,2,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,0,8],zero,ymm11[u,u,u,u,1,9],zero,ymm11[u,u,u,u,18,26],zero,ymm11[u,u,u,u,19,27],zero,ymm11[u,u,u,u] -; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] -; AVX512-NEXT: vpandn %ymm12, %ymm13, %ymm12 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[1,1,0,0,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,0] -; AVX512-NEXT: vpermq {{.*#+}} 
ymm13 = ymm13[0,0,1,0] -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13 -; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 -; AVX512-NEXT: vporq %zmm12, %zmm11, %zmm11 -; AVX512-NEXT: vpternlogd {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm10)) -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,1,1,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,ymm8[u,u,u,10,2],zero,zero,ymm8[u,u,u,11,3],zero,zero,ymm8[u,u,u,20,28],zero,zero,ymm8[u,u,u,21,29],zero,zero,ymm8[u] -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9,u,u,u],zero,zero,ymm9[2,10,u,u,u],zero,zero,ymm9[3,19,u,u,u],zero,zero,ymm9[28,20,u,u,u],zero,zero,ymm9[29,21,u] -; AVX512-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5,5,6] -; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,9],zero,ymm7[u,u,u,u,2,10],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u,20,28],zero,ymm7[u,u,u,u,21] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm9 & ~mem) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm8)) -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[12,13,u,u,u],zero,zero,xmm5[14,15,u,u,u] -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13],zero,zero,xmm0[u,u,u,14,15],zero,zero,xmm0[u,u,u] -; AVX512-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512-NEXT: vmovdqa (%r8), %xmm1 +; AVX512-NEXT: vmovdqa (%r9), %xmm2 +; AVX512-NEXT: vmovdqa (%r10), %xmm0 +; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3 +; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5 +; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,5],zero,ymm4[u,u,u,u,u,6],zero,ymm4[u,u,u,u,u],zero,ymm4[23,u,u,u,u,u],zero,ymm4[24,u,u,u,u] +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ~ymm8 & (ymm7 | ymm6) +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,2,0,2] +; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8,u,u,u],zero,zero,ymm6[1,9,u,u,u],zero,zero,ymm6[18,26,u,u,u],zero,zero,ymm6[19,27,u,u,u],zero,zero,ymm6[20,28] +; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 +; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,5],zero,ymm3[u,u,u,u,u,6],zero,ymm3[u,u,u,u,u],zero,ymm3[23,u,u,u,u,u],zero,ymm3[24,u,u,u,u,u],zero +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm3[2,3,0,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = 
ymm9[u,u,u],zero,ymm9[5,u,u,u,u,u],zero,ymm9[6,u,u,u,u,u,23],zero,ymm9[u,u,u,u,u,24],zero,ymm9[u,u,u,u,u,25] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 & (ymm9 | ymm7) +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2] +; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,8],zero,zero,ymm7[u,u,u,1,9],zero,zero,ymm7[u,u,u,2,10],zero,zero,ymm7[u,u,u,19,27],zero,zero,ymm7[u,u,u,20,28],zero,zero +; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 +; AVX512-NEXT: vporq %zmm6, %zmm7, %zmm6 +; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[4],zero,ymm5[u,u,u,u,u,5],zero,ymm5[u,u,u,u,u,6],zero,ymm5[u,u,u,u,u],zero,ymm5[23,u,u,u,u,u],zero,ymm5[24,u,u] +; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[4,u,u,u,u,u],zero,ymm8[5,u,u,u,u,u],zero,ymm8[6,u,u,u,u,u,23],zero,ymm8[u,u,u,u,u,24],zero,ymm8[u,u] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm9 & (ymm8 | ymm7) +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,2,0,2] +; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,0,8],zero,ymm7[u,u,u,u,1,9],zero,ymm7[u,u,u,u,18,26],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u] +; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,0] +; AVX512-NEXT: vpandn %ymm8, %ymm9, %ymm8 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[1,1,0,0,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,0] +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0] +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9 +; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 +; AVX512-NEXT: vporq %zmm8, %zmm7, %zmm7 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm6)) +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm3[3,1,1,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,ymm6[u,u,u,10,2],zero,zero,ymm6[u,u,u,11,3],zero,zero,ymm6[u,u,u,20,28],zero,zero,ymm6[u,u,u,21,29],zero,zero,ymm6[u] +; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm4[1,3,3,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u] +; AVX512-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6] +; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,1,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm8 & ~mem) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm6)) +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u] +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,14],zero,zero,xmm3[u,u,u,7,15],zero,zero,xmm3[u,u,u] +; AVX512-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero -; 
AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15] -; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0)) -; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm7, %zmm0 -; AVX512-NEXT: vmovdqa %xmm1, 96(%rax) -; AVX512-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3)) +; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm1 +; AVX512-NEXT: vmovdqa %xmm0, 96(%rax) +; AVX512-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -2126,69 +2118,67 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm6 -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm3 -; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm4 -; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm2 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm8 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm9 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm10[0,8,u,u,u],zero,zero,ymm10[1,9,u,u,u],zero,zero,ymm10[18,26,u,u,u],zero,zero,ymm10[19,27,u,u,u],zero,zero,ymm10[20,28] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [1,5,2,6,1,5,2,6] -; AVX512-FCP-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm8, %ymm11, %ymm12 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u],zero,zero,ymm12[1,5,u,u,u],zero,zero,ymm12[2,6,u,u,u],zero,zero,ymm12[19,23,u,u,u],zero,zero,ymm12[24,28,u,u,u],zero -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm7[0,2,0,2] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,8],zero,zero,ymm12[u,u,u,1,9],zero,zero,ymm12[u,u,u,2,10],zero,zero,ymm12[u,u,u,19,27],zero,zero,ymm12[u,u,u,20,28],zero,zero -; AVX512-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm13 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,1,5],zero,zero,ymm13[u,u,u,2,6],zero,zero,ymm13[u,u,u,19,23],zero,zero,ymm13[u,u,u,24,28],zero,zero,ymm13[u,u,u,25] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512-FCP-NEXT: vporq %zmm10, %zmm12, %zmm10 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[1,1,0,0,4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,1,0,1,0,0,0,0] -; AVX512-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm9[0,2,0,2] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,0,8],zero,ymm13[u,u,u,u,1,9],zero,ymm13[u,u,u,u,18,26],zero,ymm13[u,u,u,u,19,27],zero,ymm13[u,u,u,u] -; AVX512-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm11 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,4],zero,ymm11[u,u,u,u,1,5],zero,ymm11[u,u,u,u,2,6],zero,ymm11[u,u,u,u,19,23],zero,ymm11[u,u,u,u,24,28],zero,ymm11[u] -; 
AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 | (zmm12 & mem) -; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm10)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,1,1,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,ymm7[u,u,u,10,2],zero,zero,ymm7[u,u,u,11,3],zero,zero,ymm7[u,u,u,20,28],zero,zero,ymm7[u,u,u,21,29],zero,zero,ymm7[u] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,3,1] +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm1 +; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm2 +; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm0 +; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3 +; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,2,0,2] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8,u,u,u],zero,zero,ymm6[1,9,u,u,u],zero,zero,ymm6[18,26,u,u,u],zero,zero,ymm6[19,27,u,u,u],zero,zero,ymm6[20,28] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,5,2,6,1,5,2,6] +; AVX512-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm8 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u],zero,zero,ymm8[1,5,u,u,u],zero,zero,ymm8[2,6,u,u,u],zero,zero,ymm8[19,23,u,u,u],zero,zero,ymm8[24,28,u,u,u],zero +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,2,0,2] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,8],zero,zero,ymm8[u,u,u,1,9],zero,zero,ymm8[u,u,u,2,10],zero,zero,ymm8[u,u,u,19,27],zero,zero,ymm8[u,u,u,20,28],zero,zero +; AVX512-FCP-NEXT: vpermd %ymm3, %ymm7, %ymm9 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,1,5],zero,zero,ymm9[u,u,u,2,6],zero,zero,ymm9[u,u,u,19,23],zero,zero,ymm9[u,u,u,24,28],zero,zero,ymm9[u,u,u,25] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512-FCP-NEXT: vporq %zmm6, %zmm8, %zmm6 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[1,1,0,0,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,1,0,1,0,0,0,0] +; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm5[0,2,0,2] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,0,8],zero,ymm9[u,u,u,u,1,9],zero,ymm9[u,u,u,u,18,26],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u] +; AVX512-FCP-NEXT: vpermd %ymm5, %ymm7, %ymm7 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,4],zero,ymm7[u,u,u,u,1,5],zero,ymm7[u,u,u,u,2,6],zero,ymm7[u,u,u,u,19,23],zero,ymm7[u,u,u,u,24,28],zero,ymm7[u] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm9, %zmm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 | (zmm8 & mem) +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm6)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm3[3,1,1,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,ymm6[u,u,u,10,2],zero,zero,ymm6[u,u,u,11,3],zero,zero,ymm6[u,u,u,20,28],zero,zero,ymm6[u,u,u,21,29],zero,zero,ymm6[u] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm4[1,3,3,1] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u] -; AVX512-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 -; 
AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,5,5,6] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3] -; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm8 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,1,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,1,9],zero,ymm9[u,u,u,u,2,10],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u,20,28],zero,ymm9[u,u,u,u,21] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm8 & ~mem) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm7)) -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[12,13,u,u,u],zero,zero,xmm5[14,15,u,u,u] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13],zero,zero,xmm0[u,u,u,14,15],zero,zero,xmm0[u,u,u] -; AVX512-FCP-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3] +; AVX512-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,1,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm8 & ~mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm6)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,14],zero,zero,xmm3[u,u,u,7,15],zero,zero,xmm3[u,u,u] +; AVX512-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15] -; AVX512-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0)) -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm9, %zmm0 -; AVX512-FCP-NEXT: vmovdqa %xmm1, 96(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512-FCP-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] +; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3)) +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm1 +; AVX512-FCP-NEXT: vmovdqa %xmm0, 96(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512-FCP-NEXT: vmovdqa %ymm1, 64(%rax) ; 
AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -2196,76 +2186,74 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm6 -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm3 -; AVX512DQ-NEXT: vmovdqa (%r9), %xmm4 -; AVX512DQ-NEXT: vmovdqa (%r10), %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 -; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm9 -; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm7 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm9[u,u,u,u,u,5],zero,ymm9[u,u,u,u,u,6],zero,ymm9[u,u,u,u,u],zero,ymm9[23,u,u,u,u,u],zero,ymm9[24,u,u,u,u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u],zero,ymm11[5,u,u,u,u,u],zero,ymm11[6,u,u,u,u,u,23],zero,ymm11[u,u,u,u,u,24],zero,ymm11[u,u,u,u] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ~ymm12 & (ymm11 | ymm10) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm10[0,8,u,u,u],zero,zero,ymm10[1,9,u,u,u],zero,zero,ymm10[18,26,u,u,u],zero,zero,ymm10[19,27,u,u,u],zero,zero,ymm10[20,28] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm8[u,u,u,5],zero,ymm8[u,u,u,u,u,6],zero,ymm8[u,u,u,u,u],zero,ymm8[23,u,u,u,u,u],zero,ymm8[24,u,u,u,u,u],zero -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u],zero,ymm13[5,u,u,u,u,u],zero,ymm13[6,u,u,u,u,u,23],zero,ymm13[u,u,u,u,u,24],zero,ymm13[u,u,u,u,u,25] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm12 & (ymm13 | ymm11) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm8[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,8],zero,zero,ymm11[u,u,u,1,9],zero,zero,ymm11[u,u,u,2,10],zero,zero,ymm11[u,u,u,19,27],zero,zero,ymm11[u,u,u,20,28],zero,zero -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm11 -; AVX512DQ-NEXT: vporq %zmm10, %zmm11, %zmm10 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[4],zero,ymm7[u,u,u,u,u,5],zero,ymm7[u,u,u,u,u,6],zero,ymm7[u,u,u,u,u],zero,ymm7[23,u,u,u,u,u],zero,ymm7[24,u,u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm7[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = zero,ymm12[4,u,u,u,u,u],zero,ymm12[5,u,u,u,u,u],zero,ymm12[6,u,u,u,u,u,23],zero,ymm12[u,u,u,u,u,24],zero,ymm12[u,u] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm13 & (ymm12 | ymm11) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,0,8],zero,ymm11[u,u,u,u,1,9],zero,ymm11[u,u,u,u,18,26],zero,ymm11[u,u,u,u,19,27],zero,ymm11[u,u,u,u] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] -; AVX512DQ-NEXT: vpandn %ymm12, %ymm13, %ymm12 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[1,1,0,0,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = 
xmm13[0,1,2,0] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 -; AVX512DQ-NEXT: vporq %zmm12, %zmm11, %zmm11 -; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm10)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,1,1,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,ymm8[u,u,u,10,2],zero,zero,ymm8[u,u,u,11,3],zero,zero,ymm8[u,u,u,20,28],zero,zero,ymm8[u,u,u,21,29],zero,zero,ymm8[u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9,u,u,u],zero,zero,ymm9[2,10,u,u,u],zero,zero,ymm9[3,19,u,u,u],zero,zero,ymm9[28,20,u,u,u],zero,zero,ymm9[29,21,u] -; AVX512DQ-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5,5,6] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,9],zero,ymm7[u,u,u,u,2,10],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u,20,28],zero,ymm7[u,u,u,u,21] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm9 & ~mem) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm8)) -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[12,13,u,u,u],zero,zero,xmm5[14,15,u,u,u] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13],zero,zero,xmm0[u,u,u,14,15],zero,zero,xmm0[u,u,u] -; AVX512DQ-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm1 +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm2 +; AVX512DQ-NEXT: vmovdqa (%r10), %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3 +; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,5],zero,ymm4[u,u,u,u,u,6],zero,ymm4[u,u,u,u,u],zero,ymm4[23,u,u,u,u,u],zero,ymm4[24,u,u,u,u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ~ymm8 & (ymm7 | ymm6) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,2,0,2] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8,u,u,u],zero,zero,ymm6[1,9,u,u,u],zero,zero,ymm6[18,26,u,u,u],zero,zero,ymm6[19,27,u,u,u],zero,zero,ymm6[20,28] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,5],zero,ymm3[u,u,u,u,u,6],zero,ymm3[u,u,u,u,u],zero,ymm3[23,u,u,u,u,u],zero,ymm3[24,u,u,u,u,u],zero +; 
AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm3[2,3,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u],zero,ymm9[5,u,u,u,u,u],zero,ymm9[6,u,u,u,u,u,23],zero,ymm9[u,u,u,u,u,24],zero,ymm9[u,u,u,u,u,25] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 & (ymm9 | ymm7) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,8],zero,zero,ymm7[u,u,u,1,9],zero,zero,ymm7[u,u,u,2,10],zero,zero,ymm7[u,u,u,19,27],zero,zero,ymm7[u,u,u,20,28],zero,zero +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 +; AVX512DQ-NEXT: vporq %zmm6, %zmm7, %zmm6 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[4],zero,ymm5[u,u,u,u,u,5],zero,ymm5[u,u,u,u,u,6],zero,ymm5[u,u,u,u,u],zero,ymm5[23,u,u,u,u,u],zero,ymm5[24,u,u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[4,u,u,u,u,u],zero,ymm8[5,u,u,u,u,u],zero,ymm8[6,u,u,u,u,u,23],zero,ymm8[u,u,u,u,u,24],zero,ymm8[u,u] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm9 & (ymm8 | ymm7) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,2,0,2] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,0,8],zero,ymm7[u,u,u,u,1,9],zero,ymm7[u,u,u,u,18,26],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,0] +; AVX512DQ-NEXT: vpandn %ymm8, %ymm9, %ymm8 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[1,1,0,0,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,0] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0] +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 +; AVX512DQ-NEXT: vporq %zmm8, %zmm7, %zmm7 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm6)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm3[3,1,1,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,ymm6[u,u,u,10,2],zero,zero,ymm6[u,u,u,11,3],zero,zero,ymm6[u,u,u,20,28],zero,zero,ymm6[u,u,u,21,29],zero,zero,ymm6[u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm4[1,3,3,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u] +; AVX512DQ-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,1,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm8 & ~mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm6)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,14],zero,zero,xmm3[u,u,u,7,15],zero,zero,xmm3[u,u,u] +; AVX512DQ-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = 
xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15] -; AVX512DQ-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0)) -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm7, %zmm0 -; AVX512DQ-NEXT: vmovdqa %xmm1, 96(%rax) -; AVX512DQ-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] +; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3)) +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm1 +; AVX512DQ-NEXT: vmovdqa %xmm0, 96(%rax) +; AVX512DQ-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2273,69 +2261,67 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm2 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm8 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm9 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm10[0,8,u,u,u],zero,zero,ymm10[1,9,u,u,u],zero,zero,ymm10[18,26,u,u,u],zero,zero,ymm10[19,27,u,u,u],zero,zero,ymm10[20,28] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [1,5,2,6,1,5,2,6] -; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm11, %ymm12 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u],zero,zero,ymm12[1,5,u,u,u],zero,zero,ymm12[2,6,u,u,u],zero,zero,ymm12[19,23,u,u,u],zero,zero,ymm12[24,28,u,u,u],zero -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm7[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,8],zero,zero,ymm12[u,u,u,1,9],zero,zero,ymm12[u,u,u,2,10],zero,zero,ymm12[u,u,u,19,27],zero,zero,ymm12[u,u,u,20,28],zero,zero -; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm13 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,1,5],zero,zero,ymm13[u,u,u,2,6],zero,zero,ymm13[u,u,u,19,23],zero,zero,ymm13[u,u,u,24,28],zero,zero,ymm13[u,u,u,25] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512DQ-FCP-NEXT: vporq %zmm10, %zmm12, %zmm10 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[1,1,0,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,1,0,1,0,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm9[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = 
ymm13[u,u,u,u,0,8],zero,ymm13[u,u,u,u,1,9],zero,ymm13[u,u,u,u,18,26],zero,ymm13[u,u,u,u,19,27],zero,ymm13[u,u,u,u] -; AVX512DQ-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm11 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,4],zero,ymm11[u,u,u,u,1,5],zero,ymm11[u,u,u,u,2,6],zero,ymm11[u,u,u,u,19,23],zero,ymm11[u,u,u,u,24,28],zero,ymm11[u] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 | (zmm12 & mem) -; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm10)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,1,1,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,ymm7[u,u,u,10,2],zero,zero,ymm7[u,u,u,11,3],zero,zero,ymm7[u,u,u,20,28],zero,zero,ymm7[u,u,u,21,29],zero,zero,ymm7[u] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,3,1] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm0 +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,2,0,2] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8,u,u,u],zero,zero,ymm6[1,9,u,u,u],zero,zero,ymm6[18,26,u,u,u],zero,zero,ymm6[19,27,u,u,u],zero,zero,ymm6[20,28] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,5,2,6,1,5,2,6] +; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm8 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u],zero,zero,ymm8[1,5,u,u,u],zero,zero,ymm8[2,6,u,u,u],zero,zero,ymm8[19,23,u,u,u],zero,zero,ymm8[24,28,u,u,u],zero +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,2,0,2] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,8],zero,zero,ymm8[u,u,u,1,9],zero,zero,ymm8[u,u,u,2,10],zero,zero,ymm8[u,u,u,19,27],zero,zero,ymm8[u,u,u,20,28],zero,zero +; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm7, %ymm9 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,1,5],zero,zero,ymm9[u,u,u,2,6],zero,zero,ymm9[u,u,u,19,23],zero,zero,ymm9[u,u,u,24,28],zero,zero,ymm9[u,u,u,25] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512DQ-FCP-NEXT: vporq %zmm6, %zmm8, %zmm6 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[1,1,0,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,1,0,1,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm5[0,2,0,2] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,0,8],zero,ymm9[u,u,u,u,1,9],zero,ymm9[u,u,u,u,18,26],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u] +; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm7, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,4],zero,ymm7[u,u,u,u,1,5],zero,ymm7[u,u,u,u,2,6],zero,ymm7[u,u,u,u,19,23],zero,ymm7[u,u,u,u,24,28],zero,ymm7[u] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm9, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 | (zmm8 & mem) +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm6)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm3[3,1,1,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} 
ymm6 = ymm6[1],zero,zero,ymm6[u,u,u,10,2],zero,zero,ymm6[u,u,u,11,3],zero,zero,ymm6[u,u,u,20,28],zero,zero,ymm6[u,u,u,21,29],zero,zero,ymm6[u] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm4[1,3,3,1] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u] -; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,5,5,6] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3] -; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm8 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,1,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,1,9],zero,ymm9[u,u,u,u,2,10],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u,20,28],zero,ymm9[u,u,u,u,21] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm8 & ~mem) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm7)) -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[12,13,u,u,u],zero,zero,xmm5[14,15,u,u,u] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13],zero,zero,xmm0[u,u,u,14,15],zero,zero,xmm0[u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3] +; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,1,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm8 & ~mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm6)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,14],zero,zero,xmm3[u,u,u,7,15],zero,zero,xmm3[u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15] -; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0)) -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm9, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 96(%rax) -; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm11, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] +; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3)) +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 96(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -2343,82 +2329,80 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512BW-NEXT: vmovdqa (%rcx), %xmm5 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm6 -; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0 -; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm1 -; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm6, %ymm6 -; AVX512BW-NEXT: vinserti32x4 $2, (%r10), %zmm6, %zmm6 -; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] -; AVX512BW-NEXT: vextracti64x4 $1, %zmm6, %ymm8 -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm8[0,1,2,3,6,7,7,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,3,2] +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512BW-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] +; AVX512BW-NEXT: vextracti64x4 $1, %zmm2, %ymm4 +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,7,7,7] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,3,2] ; AVX512BW-NEXT: movw $-32510, %cx # imm = 0x8102 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %xmm9, %xmm7 {%k1} -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[12,13],zero,zero,zero,zero,zero,xmm2[14,15],zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[12,13],zero,zero,zero,zero,zero,xmm3[14,15],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vmovdqu8 %xmm5, %xmm3 {%k1} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,2,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[6,14],zero,zero,zero,zero,zero,xmm5[7,15],zero,zero,zero,zero,zero +; AVX512BW-NEXT: 
vpermq {{.*#+}} ymm6 = ymm0[1,3,2,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[6,14],zero,zero,zero,zero,zero,xmm6[7,15],zero,zero,zero +; AVX512BW-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX512BW-NEXT: movw $-7741, %cx # imm = 0xE1C3 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %xmm7, %xmm2 {%k1} +; AVX512BW-NEXT: vmovdqu8 %xmm3, %xmm5 {%k1} ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] ; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermw %ymm8, %ymm3, %ymm3 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm6[1,3,1,3] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] +; AVX512BW-NEXT: vpermw %ymm4, %ymm3, %ymm3 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm2[1,3,1,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] ; AVX512BW-NEXT: movl $67637280, %ecx # imm = 0x4081020 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[1,3,3,1] +; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm6 {%k1} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[1,3,3,1] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,ymm3[1,9],zero,zero,zero,zero,zero,ymm3[2,10],zero,zero,zero,zero,zero,ymm3[3,19],zero,zero,zero,zero,zero,ymm3[28,20],zero,zero,zero,zero,zero,ymm3[29,21],zero -; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm0[3,1,1,3] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[1],zero,zero,zero,zero,zero,ymm5[10,2],zero,zero,zero,zero,zero,ymm5[11,3],zero,zero,zero,zero,zero,ymm5[20,28],zero,zero,zero,zero,zero,ymm5[21,29],zero,zero,zero -; AVX512BW-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[3,1,1,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero +; AVX512BW-NEXT: vpor %ymm3, %ymm7, %ymm3 ; AVX512BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k1} -; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] -; AVX512BW-NEXT: vpermi2w %zmm6, %zmm8, %zmm4 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero -; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX512BW-NEXT: vmovdqu8 %ymm6, %ymm3 {%k1} +; AVX512BW-NEXT: vinserti32x4 $2, %xmm5, %zmm3, %zmm3 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm6 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[4],zero,zero,zero,zero,zero,zero,ymm2[5],zero,zero,zero,zero,zero,zero,ymm2[6],zero,zero,zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,ymm2[24],zero,zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[4],zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero -; AVX512BW-NEXT: vpor %ymm5, %ymm7, %ymm5 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = 
ymm6[0,2,0,2] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512BW-NEXT: vpor %ymm4, %ymm7, %ymm4 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512BW-NEXT: kmovq %rcx, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm5 {%k1} -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero -; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm1[2,3,0,1] +; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm2 {%k1} +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpor %ymm4, %ymm6, %ymm4 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero,ymm7[25] ; AVX512BW-NEXT: vpor %ymm7, %ymm6, %ymm6 ; AVX512BW-NEXT: movl $202911840, %ecx # imm = 0xC183060 ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqu8 %ymm4, %ymm6 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,ymm0[2,10],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero,ymm0[20,28],zero,zero ; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[18,26],zero,zero,zero,zero,zero,ymm1[19,27],zero,zero,zero,zero,zero,ymm1[20,28] -; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[19,27],zero,zero,zero,zero,zero,ymm1[20,28],zero,zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,8],zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero,ymm0[20,28] +; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 ; AVX512BW-NEXT: movabsq 
$8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 ; AVX512BW-NEXT: kmovq %rcx, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa %xmm2, 96(%rax) +; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa %xmm5, 96(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX512BW-NEXT: vzeroupper @@ -2428,72 +2412,70 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm4 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 -; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm4, %ymm4 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm4, %zmm4 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[12,13],zero,zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero,zero -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[12,13],zero,zero,zero,zero,zero,xmm1[14,15],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm1 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero -; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm4, %ymm2 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm2[13],zero,zero,zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,zero,zero,xmm2[15] -; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 +; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[1,3,2,3] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero,zero,zero +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,2,3] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10],zero,zero,zero,zero,zero,xmm4[13,12],zero,zero,zero,zero,zero,xmm4[15,14],zero +; AVX512BW-FCP-NEXT: vextracti64x4 $1, 
%zmm2, %ymm5 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm5[13],zero,zero,zero,zero,zero,zero,xmm5[14],zero,zero,zero,zero,zero,zero,xmm5[15] +; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm6, %xmm4 ; AVX512BW-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512BW-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermw %ymm2, %ymm1, %ymm1 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm4[1,3,1,3] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] +; AVX512BW-FCP-NEXT: vmovdqu8 %xmm4, %xmm3 {%k1} +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] +; AVX512BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpermw %ymm5, %ymm4, %ymm4 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[1,3,1,3] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] ; AVX512BW-FCP-NEXT: movl $67637280, %ecx # imm = 0x4081020 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm3 {%k1} -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm6[1,3,3,1] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[3,19],zero,zero,zero,zero,zero,ymm1[28,20],zero,zero,zero,zero,zero,ymm1[29,21],zero -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm5[3,1,1,3] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm4, %ymm6 {%k1} +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,3,1] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10],zero,zero,zero,zero,zero,ymm4[3,19],zero,zero,zero,zero,zero,ymm4[28,20],zero,zero,zero,zero,zero,ymm4[29,21],zero +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm1[3,1,1,3] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm7, %ymm1 +; AVX512BW-FCP-NEXT: vpor %ymm4, %ymm7, %ymm4 ; AVX512BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1} -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] -; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,5,2,6,1,5,2,6] -; AVX512BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermd %ymm4, %ymm2, %ymm7 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,0,2] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm6, %ymm4 {%k1} +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] +; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,5,2,6,1,5,2,6] +; AVX512BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; 
AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm7 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u] ; AVX512BW-FCP-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm3, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vpermd %ymm5, %ymm2, %ymm3 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,0,2] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zero,zero,zmm3[1,9],zero,zero,zero,zero,zero,zmm3[2,10],zero,zero,zero,zero,zero,zmm3[19,27],zero,zero,zero,zero,zero,zmm3[20,28],zero,zero,zero,zero,zero,zmm3[33,37],zero,zero,zero,zero,zero,zmm3[34,38],zero,zero,zero,zero,zero,zmm3[51,55],zero,zero,zero,zero,zero,zmm3[56,60],zero,zero,zero,zero,zero,zmm3[57] -; AVX512BW-FCP-NEXT: vpermd %ymm6, %ymm2, %ymm2 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm6[0,2,0,2] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zmm2[18,26],zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zmm2[33,37],zero,zero,zero,zero,zero,zmm2[34,38],zero,zero,zero,zero,zero,zmm2[51,55],zero,zero,zero,zero,zero,zmm2[56,60],zero,zero,zero,zero -; AVX512BW-FCP-NEXT: vporq %zmm3, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm6 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm1 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zmm1[33,37],zero,zero,zero,zero,zero,zmm1[34,38],zero,zero,zero,zero,zero,zmm1[51,55],zero,zero,zero,zero,zero,zmm1[56,60],zero,zero,zero,zero,zero,zmm1[57] +; AVX512BW-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm5 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[0,8],zero,zero,zero,zero,zero,zmm0[1,9],zero,zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zmm0[20,28],zero,zero,zero,zero,zero,zmm0[33,37],zero,zero,zero,zero,zero,zmm0[34,38],zero,zero,zero,zero,zero,zmm0[51,55],zero,zero,zero,zero,zero,zmm0[56,60],zero,zero,zero,zero +; AVX512BW-FCP-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 ; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa %xmm3, 96(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa %ymm4, 64(%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -2501,82 +2483,80 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW: # %bb.0: ; 
AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm5 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm6 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm1 -; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm6, %ymm6 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r10), %zmm6, %zmm6 -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] -; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm6, %ymm8 -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm8[0,1,2,3,6,7,7,7] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,3,2] +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 +; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] +; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm2, %ymm4 +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,7,7,7] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,3,2] ; AVX512DQ-BW-NEXT: movw $-32510, %cx # imm = 0x8102 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %xmm9, %xmm7 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[12,13],zero,zero,zero,zero,zero,xmm2[14,15],zero,zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[12,13],zero,zero,zero,zero,zero,xmm3[14,15],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-BW-NEXT: vmovdqu8 %xmm5, %xmm3 {%k1} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,2,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[6,14],zero,zero,zero,zero,zero,xmm5[7,15],zero,zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[1,3,2,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[6,14],zero,zero,zero,zero,zero,xmm6[7,15],zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX512DQ-BW-NEXT: movw $-7741, %cx # imm = 0xE1C3 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %xmm7, %xmm2 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu8 %xmm3, %xmm5 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] ; AVX512DQ-BW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpermw %ymm8, %ymm3, %ymm3 
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm6[1,3,1,3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] +; AVX512DQ-BW-NEXT: vpermw %ymm4, %ymm3, %ymm3 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm2[1,3,1,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] ; AVX512DQ-BW-NEXT: movl $67637280, %ecx # imm = 0x4081020 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[1,3,3,1] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm6 {%k1} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[1,3,3,1] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,ymm3[1,9],zero,zero,zero,zero,zero,ymm3[2,10],zero,zero,zero,zero,zero,ymm3[3,19],zero,zero,zero,zero,zero,ymm3[28,20],zero,zero,zero,zero,zero,ymm3[29,21],zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm0[3,1,1,3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[1],zero,zero,zero,zero,zero,ymm5[10,2],zero,zero,zero,zero,zero,ymm5[11,3],zero,zero,zero,zero,zero,ymm5[20,28],zero,zero,zero,zero,zero,ymm5[21,29],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[3,1,1,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm7, %ymm3 ; AVX512DQ-BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k1} -; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] -; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm8, %zmm4 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm6, %ymm3 {%k1} +; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm5, %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] +; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm6 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[4],zero,zero,zero,zero,zero,zero,ymm2[5],zero,zero,zero,zero,zero,zero,ymm2[6],zero,zero,zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,ymm2[24],zero,zero +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[4],zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %ymm5, %ymm7, %ymm5 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm7, %ymm4 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, 
%ymm4, %zmm2, %zmm2 ; AVX512DQ-BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm4, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm1[2,3,0,1] +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm2 {%k1} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero,zero ; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm6, %ymm4 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero,ymm7[25] ; AVX512DQ-BW-NEXT: vpor %ymm7, %ymm6, %ymm6 ; AVX512DQ-BW-NEXT: movl $202911840, %ecx # imm = 0xC183060 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm4, %ymm6 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,ymm0[2,10],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero,ymm0[20,28],zero,zero ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[18,26],zero,zero,zero,zero,zero,ymm1[19,27],zero,zero,zero,zero,zero,ymm1[20,28] -; AVX512DQ-BW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[19,27],zero,zero,zero,zero,zero,ymm1[20,28],zero,zero +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,8],zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero,ymm0[20,28] +; AVX512DQ-BW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm5, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa %xmm2, 96(%rax) +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa %xmm5, 96(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-BW-NEXT: 
vmovdqa %ymm3, 64(%rax) ; AVX512DQ-BW-NEXT: vzeroupper @@ -2586,72 +2566,70 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm4 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm4, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm4, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[12,13],zero,zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[12,13],zero,zero,zero,zero,zero,xmm1[14,15],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero -; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm4, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm2[13],zero,zero,zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,zero,zero,xmm2[15] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[1,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10],zero,zero,zero,zero,zero,xmm4[13,12],zero,zero,zero,zero,zero,xmm4[15,14],zero +; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm2, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm5[13],zero,zero,zero,zero,zero,zero,xmm5[14],zero,zero,zero,zero,zero,zero,xmm5[15] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm6, 
%xmm4 ; AVX512DQ-BW-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512DQ-BW-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm2, %ymm1, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm4[1,3,1,3] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm4, %xmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] +; AVX512DQ-BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm5, %ymm4, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[1,3,1,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] ; AVX512DQ-BW-FCP-NEXT: movl $67637280, %ecx # imm = 0x4081020 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm6[1,3,3,1] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[3,19],zero,zero,zero,zero,zero,ymm1[28,20],zero,zero,zero,zero,zero,ymm1[29,21],zero -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm5[3,1,1,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm4, %ymm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,3,1] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10],zero,zero,zero,zero,zero,ymm4[3,19],zero,zero,zero,zero,zero,ymm4[28,20],zero,zero,zero,zero,zero,ymm4[29,21],zero +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm1[3,1,1,3] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %ymm1, %ymm7, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpor %ymm4, %ymm7, %ymm4 ; AVX512DQ-BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,5,2,6,1,5,2,6] -; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm4, %ymm2, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,0,2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm6, %ymm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,5,2,6,1,5,2,6] +; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm7 +; 
AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u] ; AVX512DQ-BW-FCP-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm3, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm5, %ymm2, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,0,2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zero,zero,zmm3[1,9],zero,zero,zero,zero,zero,zmm3[2,10],zero,zero,zero,zero,zero,zmm3[19,27],zero,zero,zero,zero,zero,zmm3[20,28],zero,zero,zero,zero,zero,zmm3[33,37],zero,zero,zero,zero,zero,zmm3[34,38],zero,zero,zero,zero,zero,zmm3[51,55],zero,zero,zero,zero,zero,zmm3[56,60],zero,zero,zero,zero,zero,zmm3[57] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm6, %ymm2, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm6[0,2,0,2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zmm2[18,26],zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zmm2[33,37],zero,zero,zero,zero,zero,zmm2[34,38],zero,zero,zero,zero,zero,zmm2[51,55],zero,zero,zero,zero,zero,zmm2[56,60],zero,zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %zmm3, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zmm1[33,37],zero,zero,zero,zero,zero,zmm1[34,38],zero,zero,zero,zero,zero,zmm1[51,55],zero,zero,zero,zero,zero,zmm1[56,60],zero,zero,zero,zero,zero,zmm1[57] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[0,8],zero,zero,zero,zero,zero,zmm0[1,9],zero,zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zmm0[20,28],zero,zero,zero,zero,zero,zmm0[33,37],zero,zero,zero,zero,zero,zmm0[34,38],zero,zero,zero,zero,zero,zmm0[51,55],zero,zero,zero,zero,zero,zmm0[56,60],zero,zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 ; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, 96(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64 diff --git 
a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll index 9e82c84fe5520..ec54b75513582 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll @@ -946,10 +946,8 @@ define <2 x i64> @PR116815(<4 x i64> %v0, <4 x i64> %v1) { ; CHECK: # %bb.0: ; CHECK-NEXT: vpslld $16, %ymm1, %ymm1 ; CHECK-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,2,6,10,14,u,u,u,u,u,u,u,u,16,20,24,28,18,22,26,30,u,u,u,u,u,u,u,u] ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vmovq {{.*#+}} xmm2 = [0,4,8,12,2,6,10,14,0,0,0,0,0,0,0,0] -; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: ret{{[l|q]}} diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll index 3d49edbb7bd8d..3e76bffb77a66 100644 --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -483,50 +483,42 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind { ; ; AVX2-LABEL: interleaved_load_vf16_i8_stride4: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 -; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,0,0,4] +; AVX2-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm5 +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX2-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX2-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm5 +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-NEXT: vmovd {{.*#+}} xmm6 = 
[1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7 -; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-NEXT: vpcmpeqb %xmm5, %xmm4, %xmm4 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-NEXT: vmovd {{.*#+}} xmm6 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7 -; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-NEXT: vpcmpeqb %xmm0, %xmm5, %xmm0 -; AVX2-NEXT: vpxor %xmm0, %xmm4, %xmm0 +; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX2-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX2-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-NEXT: vpcmpeqb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: interleaved_load_vf16_i8_stride4: @@ -646,76 +638,66 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind { ; ; AVX2-LABEL: interleaved_load_vf32_i8_stride4: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm7 -; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm6 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm8 -; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm9 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,3] -; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm9 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4] -; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9 -; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm7 -; AVX2-NEXT: vpermd %ymm7, %ymm6, %ymm7 -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = 
ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm9 -; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8 +; AVX2-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-NEXT: vpshufb %ymm5, %ymm4, %ymm6 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4] +; AVX2-NEXT: vpermd %ymm6, %ymm2, %ymm6 +; AVX2-NEXT: vpshufb %ymm5, %ymm3, %ymm7 +; AVX2-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-NEXT: vpshufb %ymm5, %ymm1, %ymm7 +; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm5 +; AVX2-NEXT: vpermd %ymm5, %ymm2, %ymm5 +; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-NEXT: vpshufb %ymm6, %ymm4, %ymm7 +; AVX2-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm8 +; AVX2-NEXT: vpermd %ymm8, %ymm2, %ymm8 +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm8 +; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm11 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX2-NEXT: vpshufb %ymm9, %ymm1, %ymm10 -; AVX2-NEXT: vpermd %ymm10, %ymm6, %ymm10 -; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm9 -; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9 -; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vpcmpeqb %ymm7, %ymm8, %ymm7 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm9 -; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8 +; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm6 +; AVX2-NEXT: vpermd %ymm6, %ymm2, %ymm6 +; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vpcmpeqb %ymm6, %ymm5, %ymm5 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-NEXT: vpshufb %ymm6, %ymm4, %ymm7 +; AVX2-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm8 +; AVX2-NEXT: vpermd %ymm8, %ymm2, %ymm8 +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm8 +; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm11 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; 
AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
-; AVX2-NEXT: vpshufb %ymm9, %ymm1, %ymm10
-; AVX2-NEXT: vpermd %ymm10, %ymm6, %ymm10
-; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm9
-; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9
-; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-NEXT: vpshufb %xmm9, %xmm5, %xmm5
-; AVX2-NEXT: vpshufb %xmm9, %xmm4, %xmm4
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
-; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
-; AVX2-NEXT: vpshufb %ymm5, %ymm1, %ymm1
-; AVX2-NEXT: vpermd %ymm1, %ymm6, %ymm1
-; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm0
-; AVX2-NEXT: vpermd %ymm0, %ymm6, %ymm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vpcmpeqb %ymm0, %ymm8, %ymm0
-; AVX2-NEXT: vpxor %ymm0, %ymm7, %ymm0
+; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm6
+; AVX2-NEXT: vpermd %ymm6, %ymm2, %ymm6
+; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-NEXT: vpshufb %ymm7, %ymm4, %ymm4
+; AVX2-NEXT: vpermd %ymm4, %ymm2, %ymm4
+; AVX2-NEXT: vpshufb %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vpermd %ymm3, %ymm2, %ymm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm0
+; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: vpcmpeqb %ymm0, %ymm6, %ymm0
+; AVX2-NEXT: vpxor %ymm0, %ymm5, %ymm0
 ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;

From 4c2e4ea18fd0031636994cf81fd03d82f59b7d27 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Wed, 15 Jan 2025 00:30:53 -0800
Subject: [PATCH 40/82] [RISCV][llvm-exegesis] Disable pseudo instructions in
 allowAsBackToBack. (#122986)

Prevents crashes trying to encode pseudo instructions.

Tested on HiFive Premier P550.
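For illustration, a hedged sketch of the failure mode (the concrete opcode is
deliberately left unspecified): before this change, snippet generation could
select a pseudo instruction as a back-to-back candidate and then assert while
trying to encode it, e.g. in a run like

```
llvm-exegesis -mode=latency -opcode-name=<some RISC-V pseudo opcode>
```

`-mode` and `-opcode-name` are existing llvm-exegesis options; with the new
`allowAsBackToBack` override below, pseudos are simply skipped as candidates.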
Fixes #122974
---
 llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp b/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp
index d55db9af8a9bd..217b423d7b3f3 100644
--- a/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp
+++ b/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp
@@ -122,6 +122,10 @@ class ExegesisRISCVTarget : public ExegesisTarget {

   ArrayRef getUnavailableRegisters() const override;

+  bool allowAsBackToBack(const Instruction &Instr) const override {
+    return !Instr.Description.isPseudo();
+  }
+
   Error randomizeTargetMCOperand(const Instruction &Instr, const Variable &Var,
                                  MCOperand &AssignedValue,
                                  const BitVector &ForbiddenRegs) const override;

From c24ce324d56328e4b91c8797ea4935545084303e Mon Sep 17 00:00:00 2001
From: Matthias Springer
Date: Wed, 15 Jan 2025 09:47:12 +0100
Subject: [PATCH 41/82] [mlir][IR] Turn `FloatType` into a type interface
 (#118891)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This makes it possible to add new MLIR floating point types in downstream
projects. (Adding new APFloat semantics in downstream projects is not
possible yet, so parsing/printing/converting float literals of newly added
types is not supported.)

Also removes two functions where we had to hard-code all existing floating
point types (`FloatType::classof`). See discussion here:
https://discourse.llvm.org/t/rethink-on-approach-to-low-precision-fp-types/82361

No measurable compilation time changes for these lit tests:
```
Benchmark 1: mlir-opt ./mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir -split-input-file -convert-vector-to-llvm -o /dev/null
BEFORE
  Time (mean ± σ):  248.4 ms ± 3.2 ms  [User: 237.0 ms, System: 20.1 ms]
  Range (min … max):  243.3 ms … 255.9 ms  30 runs

AFTER
  Time (mean ± σ):  246.8 ms ± 3.2 ms  [User: 233.2 ms, System: 21.8 ms]
  Range (min … max):  240.2 ms … 252.1 ms  30 runs

Benchmark 2: mlir-opt ./mlir/test/Dialect/Arith/canonicalize.mlir -split-input-file -canonicalize -o /dev/null
BEFORE
  Time (mean ± σ):  37.3 ms ± 1.8 ms  [User: 31.6 ms, System: 30.4 ms]
  Range (min … max):  34.6 ms … 42.0 ms  200 runs

AFTER
  Time (mean ± σ):  37.5 ms ± 2.0 ms  [User: 31.5 ms, System: 29.2 ms]
  Range (min … max):  34.5 ms … 43.0 ms  200 runs

Benchmark 3: mlir-opt ./mlir/test/Dialect/Tensor/canonicalize.mlir -split-input-file -canonicalize -allow-unregistered-dialect -o /dev/null
BEFORE
  Time (mean ± σ):  152.2 ms ± 2.5 ms  [User: 140.1 ms, System: 12.2 ms]
  Range (min … max):  147.6 ms … 161.8 ms  200 runs

AFTER
  Time (mean ± σ):  151.9 ms ± 2.7 ms  [User: 140.5 ms, System: 11.5 ms]
  Range (min … max):  147.2 ms … 159.1 ms  200 runs
```

A micro benchmark that parses + prints 32768 floats with a random
floating-point type improved from 55.1 ms to 48.3 ms.
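As a usage sketch (hypothetical downstream code, not part of this patch):
once `FloatType` is an interface, a made-up out-of-tree type that implements
`FloatTypeInterface` flows through the same generic code as the builtin
float types, with no hard-coded type list anywhere:

```cpp
// `ty` is assumed to be an arbitrary mlir::Type obtained elsewhere.
if (auto fltTy = mlir::dyn_cast<mlir::FloatType>(ty)) {
  // Interface methods declared in the new FloatTypeInterface:
  const llvm::fltSemantics &sem = fltTy.getFloatSemantics();
  unsigned width = fltTy.getWidth();          // APFloat::semanticsSizeInBits(sem)
  unsigned mantissa = fltTy.getFPMantissaWidth();
  (void)sem; (void)width; (void)mantissa;
}
```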
--- mlir/include/mlir/IR/BuiltinTypeInterfaces.h | 9 ++ mlir/include/mlir/IR/BuiltinTypeInterfaces.td | 59 ++++++++++ mlir/include/mlir/IR/BuiltinTypes.h | 56 --------- mlir/include/mlir/IR/BuiltinTypes.td | 17 ++- mlir/lib/IR/BuiltinTypeInterfaces.cpp | 13 +++ mlir/lib/IR/BuiltinTypes.cpp | 106 ++++++++---------- mlir/unittests/IR/InterfaceAttachmentTest.cpp | 2 +- 7 files changed, 138 insertions(+), 124 deletions(-) diff --git a/mlir/include/mlir/IR/BuiltinTypeInterfaces.h b/mlir/include/mlir/IR/BuiltinTypeInterfaces.h index ed5e5ca22c595..e8011b5488dc9 100644 --- a/mlir/include/mlir/IR/BuiltinTypeInterfaces.h +++ b/mlir/include/mlir/IR/BuiltinTypeInterfaces.h @@ -11,6 +11,15 @@ #include "mlir/IR/Types.h" +namespace llvm { +struct fltSemantics; +} // namespace llvm + +namespace mlir { +class FloatType; +class MLIRContext; +} // namespace mlir + #include "mlir/IR/BuiltinTypeInterfaces.h.inc" #endif // MLIR_IR_BUILTINTYPEINTERFACES_H diff --git a/mlir/include/mlir/IR/BuiltinTypeInterfaces.td b/mlir/include/mlir/IR/BuiltinTypeInterfaces.td index c9dcd546cf67c..c36b738e38f42 100644 --- a/mlir/include/mlir/IR/BuiltinTypeInterfaces.td +++ b/mlir/include/mlir/IR/BuiltinTypeInterfaces.td @@ -16,6 +16,65 @@ include "mlir/IR/OpBase.td" +def FloatTypeInterface : TypeInterface<"FloatType"> { + let cppNamespace = "::mlir"; + let description = [{ + This type interface should be implemented by all floating-point types. It + defines the LLVM APFloat semantics and provides a few helper functions. + }]; + + let methods = [ + InterfaceMethod< + /*desc=*/[{ + Returns the APFloat semantics for this floating-point type. + }], + /*retTy=*/"const ::llvm::fltSemantics &", + /*methodName=*/"getFloatSemantics", + /*args=*/(ins) + >, + InterfaceMethod< + /*desc=*/[{ + Returns a float type with bitwidth scaled by `scale`. Returns a "null" + float type if the scaled element type cannot be represented. + }], + /*retTy=*/"::mlir::FloatType", + /*methodName=*/"scaleElementBitwidth", + /*args=*/(ins "unsigned":$scale), + /*methodBody=*/"", + /*defaultImplementation=*/"return ::mlir::FloatType();" + > + ]; + + let extraClassDeclaration = [{ + // Convenience factories. + static FloatType getBF16(MLIRContext *ctx); + static FloatType getF16(MLIRContext *ctx); + static FloatType getF32(MLIRContext *ctx); + static FloatType getTF32(MLIRContext *ctx); + static FloatType getF64(MLIRContext *ctx); + static FloatType getF80(MLIRContext *ctx); + static FloatType getF128(MLIRContext *ctx); + static FloatType getFloat8E5M2(MLIRContext *ctx); + static FloatType getFloat8E4M3(MLIRContext *ctx); + static FloatType getFloat8E4M3FN(MLIRContext *ctx); + static FloatType getFloat8E5M2FNUZ(MLIRContext *ctx); + static FloatType getFloat8E4M3FNUZ(MLIRContext *ctx); + static FloatType getFloat8E4M3B11FNUZ(MLIRContext *ctx); + static FloatType getFloat8E3M4(MLIRContext *ctx); + static FloatType getFloat4E2M1FN(MLIRContext *ctx); + static FloatType getFloat6E2M3FN(MLIRContext *ctx); + static FloatType getFloat6E3M2FN(MLIRContext *ctx); + static FloatType getFloat8E8M0FNU(MLIRContext *ctx); + + /// Return the bitwidth of this float type. + unsigned getWidth(); + + /// Return the width of the mantissa of this type. + /// The width includes the integer bit. 
+ unsigned getFPMantissaWidth(); + }]; +} + //===----------------------------------------------------------------------===// // MemRefElementTypeInterface //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/IR/BuiltinTypes.h b/mlir/include/mlir/IR/BuiltinTypes.h index 7f9c470ffec30..2b3c2b6d1753d 100644 --- a/mlir/include/mlir/IR/BuiltinTypes.h +++ b/mlir/include/mlir/IR/BuiltinTypes.h @@ -25,7 +25,6 @@ struct fltSemantics; namespace mlir { class AffineExpr; class AffineMap; -class FloatType; class IndexType; class IntegerType; class MemRefType; @@ -44,52 +43,6 @@ template class ValueSemantics : public TypeTrait::TraitBase {}; -//===----------------------------------------------------------------------===// -// FloatType -//===----------------------------------------------------------------------===// - -class FloatType : public Type { -public: - using Type::Type; - - // Convenience factories. - static FloatType getBF16(MLIRContext *ctx); - static FloatType getF16(MLIRContext *ctx); - static FloatType getF32(MLIRContext *ctx); - static FloatType getTF32(MLIRContext *ctx); - static FloatType getF64(MLIRContext *ctx); - static FloatType getF80(MLIRContext *ctx); - static FloatType getF128(MLIRContext *ctx); - static FloatType getFloat8E5M2(MLIRContext *ctx); - static FloatType getFloat8E4M3(MLIRContext *ctx); - static FloatType getFloat8E4M3FN(MLIRContext *ctx); - static FloatType getFloat8E5M2FNUZ(MLIRContext *ctx); - static FloatType getFloat8E4M3FNUZ(MLIRContext *ctx); - static FloatType getFloat8E4M3B11FNUZ(MLIRContext *ctx); - static FloatType getFloat8E3M4(MLIRContext *ctx); - static FloatType getFloat4E2M1FN(MLIRContext *ctx); - static FloatType getFloat6E2M3FN(MLIRContext *ctx); - static FloatType getFloat6E3M2FN(MLIRContext *ctx); - static FloatType getFloat8E8M0FNU(MLIRContext *ctx); - - /// Methods for support type inquiry through isa, cast, and dyn_cast. - static bool classof(Type type); - - /// Return the bitwidth of this float type. - unsigned getWidth(); - - /// Return the width of the mantissa of this type. - /// The width includes the integer bit. - unsigned getFPMantissaWidth(); - - /// Get or create a new FloatType with bitwidth scaled by `scale`. - /// Return null if the scaled element type cannot be represented. - FloatType scaleElementBitwidth(unsigned scale); - - /// Return the floating semantics of this float type. - const llvm::fltSemantics &getFloatSemantics(); -}; - //===----------------------------------------------------------------------===// // TensorType //===----------------------------------------------------------------------===// @@ -448,15 +401,6 @@ inline bool BaseMemRefType::isValidElementType(Type type) { llvm::isa(type); } -inline bool FloatType::classof(Type type) { - return llvm::isa(type); -} - inline FloatType FloatType::getFloat4E2M1FN(MLIRContext *ctx) { return Float4E2M1FNType::get(ctx); } diff --git a/mlir/include/mlir/IR/BuiltinTypes.td b/mlir/include/mlir/IR/BuiltinTypes.td index dca228097d782..fc50b28c09e41 100644 --- a/mlir/include/mlir/IR/BuiltinTypes.td +++ b/mlir/include/mlir/IR/BuiltinTypes.td @@ -79,8 +79,12 @@ def Builtin_Complex : Builtin_Type<"Complex", "complex"> { //===----------------------------------------------------------------------===// // Base class for Builtin dialect float types. 
-class Builtin_FloatType - : Builtin_Type { +class Builtin_FloatType declaredInterfaceMethods = []> + : Builtin_Type]> { let extraClassDeclaration = [{ static }] # name # [{Type get(MLIRContext *context); }]; @@ -322,14 +326,16 @@ def Builtin_Float8E8M0FNU : Builtin_FloatType<"Float8E8M0FNU", "f8E8M0FNU"> { //===----------------------------------------------------------------------===// // BFloat16Type -def Builtin_BFloat16 : Builtin_FloatType<"BFloat16", "bf16"> { +def Builtin_BFloat16 : Builtin_FloatType<"BFloat16", "bf16", + /*declaredInterfaceMethods=*/["scaleElementBitwidth"]> { let summary = "bfloat16 floating-point type"; } //===----------------------------------------------------------------------===// // Float16Type -def Builtin_Float16 : Builtin_FloatType<"Float16", "f16"> { +def Builtin_Float16 : Builtin_FloatType<"Float16", "f16", + /*declaredInterfaceMethods=*/["scaleElementBitwidth"]> { let summary = "16-bit floating-point type"; } @@ -343,7 +349,8 @@ def Builtin_FloatTF32 : Builtin_FloatType<"FloatTF32", "tf32"> { //===----------------------------------------------------------------------===// // Float32Type -def Builtin_Float32 : Builtin_FloatType<"Float32", "f32"> { +def Builtin_Float32 : Builtin_FloatType<"Float32", "f32", + /*declaredInterfaceMethods=*/["scaleElementBitwidth"]> { let summary = "32-bit floating-point type"; } diff --git a/mlir/lib/IR/BuiltinTypeInterfaces.cpp b/mlir/lib/IR/BuiltinTypeInterfaces.cpp index ab9e65b5edfed..c663f6c909460 100644 --- a/mlir/lib/IR/BuiltinTypeInterfaces.cpp +++ b/mlir/lib/IR/BuiltinTypeInterfaces.cpp @@ -8,6 +8,7 @@ #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/Diagnostics.h" +#include "llvm/ADT/APFloat.h" #include "llvm/ADT/Sequence.h" using namespace mlir; @@ -19,6 +20,18 @@ using namespace mlir::detail; #include "mlir/IR/BuiltinTypeInterfaces.cpp.inc" +//===----------------------------------------------------------------------===// +// FloatType +//===----------------------------------------------------------------------===// + +unsigned FloatType::getWidth() { + return APFloat::semanticsSizeInBits(getFloatSemantics()); +} + +unsigned FloatType::getFPMantissaWidth() { + return APFloat::semanticsPrecision(getFloatSemantics()); +} + //===----------------------------------------------------------------------===// // ShapedType //===----------------------------------------------------------------------===// diff --git a/mlir/lib/IR/BuiltinTypes.cpp b/mlir/lib/IR/BuiltinTypes.cpp index 6546234429c8c..41b794bc0aec5 100644 --- a/mlir/lib/IR/BuiltinTypes.cpp +++ b/mlir/lib/IR/BuiltinTypes.cpp @@ -87,72 +87,54 @@ IntegerType IntegerType::scaleElementBitwidth(unsigned scale) { } //===----------------------------------------------------------------------===// -// Float Type -//===----------------------------------------------------------------------===// - -unsigned FloatType::getWidth() { - return APFloat::semanticsSizeInBits(getFloatSemantics()); -} - -/// Returns the floating semantics for the given type. 
-const llvm::fltSemantics &FloatType::getFloatSemantics() {
-  if (llvm::isa<Float4E2M1FNType>(*this))
-    return APFloat::Float4E2M1FN();
-  if (llvm::isa<Float6E2M3FNType>(*this))
-    return APFloat::Float6E2M3FN();
-  if (llvm::isa<Float6E3M2FNType>(*this))
-    return APFloat::Float6E3M2FN();
-  if (llvm::isa<Float8E5M2Type>(*this))
-    return APFloat::Float8E5M2();
-  if (llvm::isa<Float8E4M3Type>(*this))
-    return APFloat::Float8E4M3();
-  if (llvm::isa<Float8E4M3FNType>(*this))
-    return APFloat::Float8E4M3FN();
-  if (llvm::isa<Float8E5M2FNUZType>(*this))
-    return APFloat::Float8E5M2FNUZ();
-  if (llvm::isa<Float8E4M3FNUZType>(*this))
-    return APFloat::Float8E4M3FNUZ();
-  if (llvm::isa<Float8E4M3B11FNUZType>(*this))
-    return APFloat::Float8E4M3B11FNUZ();
-  if (llvm::isa<Float8E3M4Type>(*this))
-    return APFloat::Float8E3M4();
-  if (llvm::isa<Float8E8M0FNUType>(*this))
-    return APFloat::Float8E8M0FNU();
-  if (llvm::isa<BFloat16Type>(*this))
-    return APFloat::BFloat();
-  if (llvm::isa<Float16Type>(*this))
-    return APFloat::IEEEhalf();
-  if (llvm::isa<FloatTF32Type>(*this))
-    return APFloat::FloatTF32();
-  if (llvm::isa<Float32Type>(*this))
-    return APFloat::IEEEsingle();
-  if (llvm::isa<Float64Type>(*this))
-    return APFloat::IEEEdouble();
-  if (llvm::isa<Float80Type>(*this))
-    return APFloat::x87DoubleExtended();
-  if (llvm::isa<Float128Type>(*this))
-    return APFloat::IEEEquad();
-  llvm_unreachable("non-floating point type used");
-}
-
-FloatType FloatType::scaleElementBitwidth(unsigned scale) {
-  if (!scale)
-    return FloatType();
-  MLIRContext *ctx = getContext();
-  if (isF16() || isBF16()) {
-    if (scale == 2)
-      return FloatType::getF32(ctx);
-    if (scale == 4)
-      return FloatType::getF64(ctx);
+// Float Types
+//===----------------------------------------------------------------------===//
+
+// Mapping from MLIR FloatType to APFloat semantics.
+#define FLOAT_TYPE_SEMANTICS(TYPE, SEM)                                        \
+  const llvm::fltSemantics &TYPE::getFloatSemantics() const {                  \
+    return APFloat::SEM();                                                     \
   }
-  if (isF32())
-    if (scale == 2)
-      return FloatType::getF64(ctx);
+FLOAT_TYPE_SEMANTICS(Float4E2M1FNType, Float4E2M1FN)
+FLOAT_TYPE_SEMANTICS(Float6E2M3FNType, Float6E2M3FN)
+FLOAT_TYPE_SEMANTICS(Float6E3M2FNType, Float6E3M2FN)
+FLOAT_TYPE_SEMANTICS(Float8E5M2Type, Float8E5M2)
+FLOAT_TYPE_SEMANTICS(Float8E4M3Type, Float8E4M3)
+FLOAT_TYPE_SEMANTICS(Float8E4M3FNType, Float8E4M3FN)
+FLOAT_TYPE_SEMANTICS(Float8E5M2FNUZType, Float8E5M2FNUZ)
+FLOAT_TYPE_SEMANTICS(Float8E4M3FNUZType, Float8E4M3FNUZ)
+FLOAT_TYPE_SEMANTICS(Float8E4M3B11FNUZType, Float8E4M3B11FNUZ)
+FLOAT_TYPE_SEMANTICS(Float8E3M4Type, Float8E3M4)
+FLOAT_TYPE_SEMANTICS(Float8E8M0FNUType, Float8E8M0FNU)
+FLOAT_TYPE_SEMANTICS(BFloat16Type, BFloat)
+FLOAT_TYPE_SEMANTICS(Float16Type, IEEEhalf)
+FLOAT_TYPE_SEMANTICS(FloatTF32Type, FloatTF32)
+FLOAT_TYPE_SEMANTICS(Float32Type, IEEEsingle)
+FLOAT_TYPE_SEMANTICS(Float64Type, IEEEdouble)
+FLOAT_TYPE_SEMANTICS(Float80Type, x87DoubleExtended)
+FLOAT_TYPE_SEMANTICS(Float128Type, IEEEquad)
+#undef FLOAT_TYPE_SEMANTICS
+
+FloatType Float16Type::scaleElementBitwidth(unsigned scale) const {
+  if (scale == 2)
+    return FloatType::getF32(getContext());
+  if (scale == 4)
+    return FloatType::getF64(getContext());
   return FloatType();
 }

-unsigned FloatType::getFPMantissaWidth() {
-  return APFloat::semanticsPrecision(getFloatSemantics());
+FloatType BFloat16Type::scaleElementBitwidth(unsigned scale) const {
+  if (scale == 2)
+    return FloatType::getF32(getContext());
+  if (scale == 4)
+    return FloatType::getF64(getContext());
+  return FloatType();
+}
+
+FloatType Float32Type::scaleElementBitwidth(unsigned scale) const {
+  if (scale == 2)
+    return FloatType::getF64(getContext());
   return FloatType();
 }

 //===----------------------------------------------------------------------===//
diff --git
a/mlir/unittests/IR/InterfaceAttachmentTest.cpp b/mlir/unittests/IR/InterfaceAttachmentTest.cpp
index b6066dd5685dc..1b5d3b8c31bd2 100644
--- a/mlir/unittests/IR/InterfaceAttachmentTest.cpp
+++ b/mlir/unittests/IR/InterfaceAttachmentTest.cpp
@@ -43,7 +43,7 @@ struct Model
 /// overrides default methods.
 struct OverridingModel
     : public TestExternalTypeInterface::ExternalModel<OverridingModel,
-                                                      FloatType> {
+                                                      Float32Type> {
   unsigned getBitwidthPlusArg(Type type, unsigned arg) const {
     return type.getIntOrFloatBitWidth() + arg;
   }

From af656a8d4245069f70f5b5e1f654ec9291d3b0a3 Mon Sep 17 00:00:00 2001
From: Stephen Senran Zhang
Date: Wed, 15 Jan 2025 16:53:31 +0800
Subject: [PATCH 42/82] [LVI] Learn value ranges from ctpop results (#121945)

Fixes #115751.
---
 llvm/lib/Analysis/LazyValueInfo.cpp | 24 +++
 .../CorrelatedValuePropagation/ctpop-range.ll | 142 ++++++++++++++++++
 2 files changed, 166 insertions(+)
 create mode 100644 llvm/test/Transforms/CorrelatedValuePropagation/ctpop-range.ll

diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp
index 349a0a1a2d3c4..20f69a0955f51 100644
--- a/llvm/lib/Analysis/LazyValueInfo.cpp
+++ b/llvm/lib/Analysis/LazyValueInfo.cpp
@@ -1159,6 +1159,27 @@ getRangeViaSLT(CmpInst::Predicate Pred, APInt RHS,
   return std::nullopt;
 }

+/// Get value range for a "ctpop(Val) Pred RHS" condition.
+static ValueLatticeElement getValueFromICmpCtpop(ICmpInst::Predicate Pred,
+                                                 Value *RHS) {
+  unsigned BitWidth = RHS->getType()->getScalarSizeInBits();
+
+  auto *RHSConst = dyn_cast<ConstantInt>(RHS);
+  if (!RHSConst)
+    return ValueLatticeElement::getOverdefined();
+
+  ConstantRange ResValRange =
+      ConstantRange::makeExactICmpRegion(Pred, RHSConst->getValue());
+
+  unsigned ResMin = ResValRange.getUnsignedMin().getLimitedValue(BitWidth);
+  unsigned ResMax = ResValRange.getUnsignedMax().getLimitedValue(BitWidth);
+
+  APInt ValMin = APInt::getLowBitsSet(BitWidth, ResMin);
+  APInt ValMax = APInt::getHighBitsSet(BitWidth, ResMax);
+  return ValueLatticeElement::getRange(
+      ConstantRange::getNonEmpty(std::move(ValMin), ValMax + 1));
+}
+
 std::optional<ValueLatticeElement> LazyValueInfoImpl::getValueFromICmpCondition(
     Value *Val, ICmpInst *ICI, bool isTrueDest, bool UseBlockValue) {
   Value *LHS = ICI->getOperand(0);
@@ -1192,6 +1213,9 @@ std::optional<ValueLatticeElement> LazyValueInfoImpl::getValueFromICmpCondition(
     return getValueFromSimpleICmpCondition(SwappedPred, LHS, Offset, ICI,
                                            UseBlockValue);

+  if (match(LHS, m_Intrinsic<Intrinsic::ctpop>(m_Specific(Val))))
+    return getValueFromICmpCtpop(EdgePred, RHS);
+
   const APInt *Mask, *C;
   if (match(LHS, m_And(m_Specific(Val), m_APInt(Mask))) &&
       match(RHS, m_APInt(C))) {

diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/ctpop-range.ll b/llvm/test/Transforms/CorrelatedValuePropagation/ctpop-range.ll
new file mode 100644
index 0000000000000..7101244dff4c4
--- /dev/null
+++ b/llvm/test/Transforms/CorrelatedValuePropagation/ctpop-range.ll
@@ -0,0 +1,142 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=correlated-propagation %s | FileCheck %s
+
+declare void @use(i1)
+
+define void @ctpop1(i8 %v) {
+; CHECK-LABEL: define void @ctpop1(
+; CHECK-SAME: i8 [[V:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[RES:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[V]])
+; CHECK-NEXT: [[C0_0:%.*]] = icmp samesign uge i8 [[RES]], 3
+; CHECK-NEXT: [[C0_1:%.*]] = icmp samesign ule i8 [[RES]], 7
+; CHECK-NEXT: [[C0:%.*]] = and i1 [[C0_0]], [[C0_1]]
+; CHECK-NEXT: br i1 [[C0]], label %[[RANGE_3_8:.*]], label %[[ED:.*]]
+; CHECK:
[[RANGE_3_8]]: +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[CMP1:%.*]] = icmp uge i8 [[V]], 8 +; CHECK-NEXT: call void @use(i1 [[CMP1]]) +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[CMP3:%.*]] = icmp ule i8 [[V]], -3 +; CHECK-NEXT: call void @use(i1 [[CMP3]]) +; CHECK-NEXT: ret void +; CHECK: [[ED]]: +; CHECK-NEXT: ret void +; +entry: + %res = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 %v) + %c0.0 = icmp uge i8 %res, 3 + %c0.1 = icmp ule i8 %res, 7 + %c0 = and i1 %c0.0, %c0.1 + br i1 %c0, label %range.3.8, label %ed + +range.3.8: + %cmp0 = icmp uge i8 %v, 7 + call void @use(i1 %cmp0) ; true + %cmp1 = icmp uge i8 %v, 8 + call void @use(i1 %cmp1) ; unknown + %cmp2 = icmp ule i8 %v, 254 + call void @use(i1 %cmp2) ; true + %cmp3 = icmp ule i8 %v, 253 + call void @use(i1 %cmp3) ; unknown + ret void + +ed: + ret void +} + +define void @ctpop2(i8 %v) { +; CHECK-LABEL: define void @ctpop2( +; CHECK-SAME: i8 [[V:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[RES:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[V]]) +; CHECK-NEXT: [[C2_0:%.*]] = icmp samesign uge i8 [[RES]], 1 +; CHECK-NEXT: [[C2_1:%.*]] = icmp samesign ule i8 [[RES]], 4 +; CHECK-NEXT: [[C2:%.*]] = and i1 [[C2_0]], [[C2_1]] +; CHECK-NEXT: br i1 [[C2]], label %[[RANGE_1_5:.*]], label %[[ED:.*]] +; CHECK: [[RANGE_1_5]]: +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[CMP9:%.*]] = icmp uge i8 [[V]], 2 +; CHECK-NEXT: call void @use(i1 [[CMP9]]) +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[CMP11:%.*]] = icmp ule i8 [[V]], -17 +; CHECK-NEXT: call void @use(i1 [[CMP11]]) +; CHECK-NEXT: ret void +; CHECK: [[ED]]: +; CHECK-NEXT: ret void +; +entry: + %res = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 %v) + %c2.0 = icmp uge i8 %res, 1 + %c2.1 = icmp ule i8 %res, 4 + %c2 = and i1 %c2.0, %c2.1 + br i1 %c2, label %range.1.5, label %ed + +range.1.5: + %cmp8 = icmp uge i8 %v, 1 + call void @use(i1 %cmp8) ; true + %cmp9 = icmp uge i8 %v, 2 + call void @use(i1 %cmp9) ; unknown + %cmp10 = icmp ule i8 %v, 240 + call void @use(i1 %cmp10) ; true + %cmp11 = icmp ule i8 %v, 239 + call void @use(i1 %cmp11) ; unknown + ret void + +ed: + ret void +} + +define void @ctpop3(i8 %v) { +; CHECK-LABEL: define void @ctpop3( +; CHECK-SAME: i8 [[V:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.ctpop.i8(i8 [[V]]) +; CHECK-NEXT: [[C3:%.*]] = icmp samesign uge i8 [[RES]], 8 +; CHECK-NEXT: br i1 [[C3]], label %[[RANGE_8_9:.*]], label %[[ED:.*]] +; CHECK: [[RANGE_8_9]]: +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: ret void +; CHECK: [[ED]]: +; CHECK-NEXT: ret void +; +entry: + %res = call i8 @llvm.ctpop.i8(i8 %v) + %c3 = icmp uge i8 %res, 8 + br i1 %c3, label %range.8.9, label %ed + +range.8.9: + %cmp4 = icmp eq i8 %v, -1 + call void @use(i1 %cmp4) ; true + ret void + +ed: + ret void +} + +define void @ctpop4(i8 %v) { +; CHECK-LABEL: define void @ctpop4( +; CHECK-SAME: i8 [[V:%.*]]) { +; CHECK-NEXT: [[TEST4:.*:]] +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.ctpop.i8(i8 [[V]]) +; CHECK-NEXT: [[C4:%.*]] = icmp eq i8 [[RES]], 0 +; CHECK-NEXT: br i1 [[C4]], label %[[RANGE_0_1:.*]], label %[[ED:.*]] +; CHECK: [[RANGE_0_1]]: +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: ret void +; CHECK: [[ED]]: +; CHECK-NEXT: ret void +; +test4: + %res = call i8 @llvm.ctpop.i8(i8 %v) + %c4 = icmp eq i8 %res, 0 + br i1 %c4, label %range.0.1, label %ed + +range.0.1: + %cmp5 = icmp eq i8 %v, 0 + call void @use(i1 %cmp5) ; true + ret void + +ed: + ret void +} From 
2c34632a9977a82ce6262d95f07addb772ba7014 Mon Sep 17 00:00:00 2001
From: Chuanqi Xu
Date: Wed, 15 Jan 2025 16:59:40 +0800
Subject: [PATCH 43/82] [C++20] [Modules] [Driver] Support
 -print-library-module-manifest-path for libstdc++

Now that libstdc++ has landed its std module, build systems may need clang
to locate the module manifest that describes how to build the std module.
This patch implements that lookup for libstdc++.

Tested with a locally installed GCC trunk.
---
 clang/lib/Driver/Driver.cpp | 21 ++++++++++++++++---
 ...les-print-library-module-manifest-path.cpp | 5 ++++-
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 9a947f32283c3..eefbdca805739 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -6501,9 +6501,24 @@ std::string Driver::GetStdModuleManifestPath(const Compilation &C,
     return evaluate("libc++.a").value_or(error);
   }

-  case ToolChain::CST_Libstdcxx:
-    // libstdc++ does not provide Standard library modules yet.
-    return error;
+  case ToolChain::CST_Libstdcxx: {
+    auto evaluate = [&](const char *library) -> std::optional<std::string> {
+      std::string lib = GetFilePath(library, TC);
+
+      SmallString<128> path(lib.begin(), lib.end());
+      llvm::sys::path::remove_filename(path);
+      llvm::sys::path::append(path, "libstdc++.modules.json");
+      if (TC.getVFS().exists(path))
+        return static_cast<std::string>(path);
+
+      return {};
+    };
+
+    if (std::optional<std::string> result = evaluate("libstdc++.so"); result)
+      return *result;
+
+    return evaluate("libstdc++.a").value_or(error);
+  }
 }

 return error;

diff --git a/clang/test/Driver/modules-print-library-module-manifest-path.cpp b/clang/test/Driver/modules-print-library-module-manifest-path.cpp
index 8d17fe1549e34..7606713bfa22a 100644
--- a/clang/test/Driver/modules-print-library-module-manifest-path.cpp
+++ b/clang/test/Driver/modules-print-library-module-manifest-path.cpp
@@ -48,6 +48,9 @@
 // RUN:   --target=x86_64-linux-gnu 2>&1 \
 // RUN:   | FileCheck libcxx-no-shared-lib.cpp

+// Testing with libstdc++
+// RUN: touch %t/Inputs/usr/lib/x86_64-linux-gnu/libstdc++.so
+// RUN: touch %t/Inputs/usr/lib/x86_64-linux-gnu/libstdc++.modules.json
 // RUN: %clang -print-library-module-manifest-path \
 // RUN:   -stdlib=libstdc++ \
 // RUN:   -resource-dir=%t/Inputs/usr/lib/x86_64-linux-gnu \
@@ -74,4 +77,4 @@

 //--- libstdcxx.cpp

-// CHECK: <NOT PRESENT>
+// CHECK: {{.*}}libstdc++.modules.json
\ No newline at end of file

From 4cec0ba92955c353c52efe728b2cfef3fbdf60f8 Mon Sep 17 00:00:00 2001
From: Timm Baeder
Date: Wed, 15 Jan 2025 10:09:10 +0100
Subject: [PATCH 44/82] [clang][bytecode][NFC] Simplify VisitCXXDefaultArgExpr
 (#123024)

We have `discard()` these days.
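For context, a behavioral sketch of what `delegate()` provides (this is an
assumption about intent, not a quote of the ByteCode compiler's
implementation): it re-visits the sub-expression under the caller's current
evaluation state, which is exactly the dispatch the old code spelled out by
hand:

```cpp
// Roughly, this->delegate(E) behaves like:
//   result is being discarded -> this->discard(E)
//   initializing an object    -> this->visitInitializer(E)
//   otherwise                 -> this->visit(E)
// so VisitCXXDefaultArgExpr no longer needs classify() to pick a branch.
```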
---
 clang/lib/AST/ByteCode/Compiler.cpp | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp
index c6e2a1e50a2aa..4bfb80589620c 100644
--- a/clang/lib/AST/ByteCode/Compiler.cpp
+++ b/clang/lib/AST/ByteCode/Compiler.cpp
@@ -4815,12 +4815,7 @@ template <class Emitter>
 bool Compiler<Emitter>::VisitCXXDefaultArgExpr(const CXXDefaultArgExpr *E) {
   SourceLocScope<Emitter> SLS(this, E);
-  const Expr *SubExpr = E->getExpr();
-  if (std::optional<PrimType> T = classify(E->getExpr()))
-    return this->visit(SubExpr);
-
-  assert(Initializing);
-  return this->visitInitializer(SubExpr);
+  return this->delegate(E->getExpr());
 }

 template <class Emitter>

From 04b002bbb838bc502bd6d5f602af95efd6cc96b3 Mon Sep 17 00:00:00 2001
From: Sergey Kachkov <109674256+skachkov-sc@users.noreply.github.com>
Date: Wed, 15 Jan 2025 12:19:23 +0300
Subject: [PATCH 45/82] [IRBuilder] Add Align argument for
 CreateMaskedExpandLoad and CreateMaskedCompressStore (#122878)

This patch adds the ability to specify the alignment for
llvm.masked.expandload/llvm.masked.compressstore intrinsics in IRBuilder.
This is mostly NFC for now, since the new argument is only used in
MemorySanitizer, but the intention is to also generate these intrinsics in
compiler passes, e.g. in LoopVectorizer.
---
 llvm/include/llvm/IR/IRBuilder.h | 5 +++--
 llvm/lib/IR/IRBuilder.cpp | 20 ++++++++++++++-----
 .../Instrumentation/MemorySanitizer.cpp | 13 +++++++-----
 3 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index 0332a6cc2e76e..833c91fd97461 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -854,12 +854,13 @@ class IRBuilderBase {
                             Value *Mask = nullptr);

   /// Create a call to Masked Expand Load intrinsic
-  CallInst *CreateMaskedExpandLoad(Type *Ty, Value *Ptr, Value *Mask = nullptr,
+  CallInst *CreateMaskedExpandLoad(Type *Ty, Value *Ptr, MaybeAlign Align,
+                                   Value *Mask = nullptr,
                                    Value *PassThru = nullptr,
                                    const Twine &Name = "");

   /// Create a call to Masked Compress Store intrinsic
-  CallInst *CreateMaskedCompressStore(Value *Val, Value *Ptr,
+  CallInst *CreateMaskedCompressStore(Value *Val, Value *Ptr, MaybeAlign Align,
                                       Value *Mask = nullptr);

   /// Return an all true boolean vector (mask) with \p NumElts lanes.
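An illustrative call site (a hypothetical fragment, not taken from this
patch): passing a concrete alignment attaches an `align` parameter attribute
to the pointer operand of the generated intrinsic call, while passing
`MaybeAlign()` keeps the old unannotated form:

```cpp
// Assumes an existing IRBuilder<> Builder and values VecTy, Ptr, Val, Mask.
Value *Loaded = Builder.CreateMaskedExpandLoad(
    VecTy, Ptr, Align(16), Mask, /*PassThru=*/nullptr, "expload");
Builder.CreateMaskedCompressStore(Val, Ptr, Align(16), Mask);
```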
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 27b499e42a4e4..d46ae206890e8 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -644,13 +644,15 @@ CallInst *IRBuilderBase::CreateMaskedScatter(Value *Data, Value *Ptrs, /// Create a call to Masked Expand Load intrinsic /// \p Ty - vector type to load /// \p Ptr - base pointer for the load +/// \p Align - alignment of \p Ptr /// \p Mask - vector of booleans which indicates what vector lanes should /// be accessed in memory /// \p PassThru - pass-through value that is used to fill the masked-off lanes /// of the result /// \p Name - name of the result variable CallInst *IRBuilderBase::CreateMaskedExpandLoad(Type *Ty, Value *Ptr, - Value *Mask, Value *PassThru, + MaybeAlign Align, Value *Mask, + Value *PassThru, const Twine &Name) { assert(Ty->isVectorTy() && "Type should be vector"); assert(Mask && "Mask should not be all-ones (null)"); @@ -658,24 +660,32 @@ CallInst *IRBuilderBase::CreateMaskedExpandLoad(Type *Ty, Value *Ptr, PassThru = PoisonValue::get(Ty); Type *OverloadedTypes[] = {Ty}; Value *Ops[] = {Ptr, Mask, PassThru}; - return CreateMaskedIntrinsic(Intrinsic::masked_expandload, Ops, - OverloadedTypes, Name); + CallInst *CI = CreateMaskedIntrinsic(Intrinsic::masked_expandload, Ops, + OverloadedTypes, Name); + if (Align) + CI->addParamAttr(0, Attribute::getWithAlignment(CI->getContext(), *Align)); + return CI; } /// Create a call to Masked Compress Store intrinsic /// \p Val - data to be stored, /// \p Ptr - base pointer for the store +/// \p Align - alignment of \p Ptr /// \p Mask - vector of booleans which indicates what vector lanes should /// be accessed in memory CallInst *IRBuilderBase::CreateMaskedCompressStore(Value *Val, Value *Ptr, + MaybeAlign Align, Value *Mask) { Type *DataTy = Val->getType(); assert(DataTy->isVectorTy() && "Val should be a vector"); assert(Mask && "Mask should not be all-ones (null)"); Type *OverloadedTypes[] = {DataTy}; Value *Ops[] = {Val, Ptr, Mask}; - return CreateMaskedIntrinsic(Intrinsic::masked_compressstore, Ops, - OverloadedTypes); + CallInst *CI = CreateMaskedIntrinsic(Intrinsic::masked_compressstore, Ops, + OverloadedTypes); + if (Align) + CI->addParamAttr(1, Attribute::getWithAlignment(CI->getContext(), *Align)); + return CI; } template diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 429e323b6b7c2..0169320deae46 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -3542,6 +3542,7 @@ struct MemorySanitizerVisitor : public InstVisitor { void handleMaskedExpandLoad(IntrinsicInst &I) { IRBuilder<> IRB(&I); Value *Ptr = I.getArgOperand(0); + MaybeAlign Align = I.getParamAlign(0); Value *Mask = I.getArgOperand(1); Value *PassThru = I.getArgOperand(2); @@ -3559,10 +3560,11 @@ struct MemorySanitizerVisitor : public InstVisitor { Type *ShadowTy = getShadowTy(&I); Type *ElementShadowTy = cast(ShadowTy)->getElementType(); auto [ShadowPtr, OriginPtr] = - getShadowOriginPtr(Ptr, IRB, ElementShadowTy, {}, /*isStore*/ false); + getShadowOriginPtr(Ptr, IRB, ElementShadowTy, Align, /*isStore*/ false); - Value *Shadow = IRB.CreateMaskedExpandLoad( - ShadowTy, ShadowPtr, Mask, getShadow(PassThru), "_msmaskedexpload"); + Value *Shadow = + IRB.CreateMaskedExpandLoad(ShadowTy, ShadowPtr, Align, Mask, + getShadow(PassThru), "_msmaskedexpload"); setShadow(&I, Shadow); @@ -3574,6 +3576,7 @@ struct 
MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     IRBuilder<> IRB(&I);
     Value *Values = I.getArgOperand(0);
     Value *Ptr = I.getArgOperand(1);
+    MaybeAlign Align = I.getParamAlign(1);
     Value *Mask = I.getArgOperand(2);

     if (ClCheckAccessAddress) {
@@ -3585,9 +3588,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     Type *ElementShadowTy =
         getShadowTy(cast<VectorType>(Values->getType())->getElementType());
     auto [ShadowPtr, OriginPtrs] =
-        getShadowOriginPtr(Ptr, IRB, ElementShadowTy, {}, /*isStore*/ true);
+        getShadowOriginPtr(Ptr, IRB, ElementShadowTy, Align, /*isStore*/ true);

-    IRB.CreateMaskedCompressStore(Shadow, ShadowPtr, Mask);
+    IRB.CreateMaskedCompressStore(Shadow, ShadowPtr, Align, Mask);

     // TODO: Store origins.
   }

From b3924cb9ecc95aa428d48e58ef5f2629f5166e02 Mon Sep 17 00:00:00 2001
From: Mariusz Sikora
Date: Wed, 15 Jan 2025 10:23:28 +0100
Subject: [PATCH 46/82] [AMDGPU] Set Convergent property for
 image.(getlod/sample*) intrinsics which use WQM (#122908)

This change adds the IntrConvergent property to the image.getlod intrinsic
and to several image.sample intrinsics. All image.sample intrinsics apart
from the LOD (_L), Level 0 (_LZ) and Derivative (_D) variants will be marked
as Convergent.
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 11 +-
 llvm/test/CodeGen/AMDGPU/sink-image-sample.ll | 127 ++++++++++++++++--
 2 files changed, 126 insertions(+), 12 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index b930d6983e225..b529642a55871 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -876,6 +876,8 @@ class AMDGPUSampleVariant extra_addr
   // Name of the {lod} or {clamp} argument that is appended to the coordinates,
   // if any.
   string LodOrClamp = "";
+
+  bit UsesWQM = false;
 }

 // AMDGPUSampleVariants: all variants supported by IMAGE_SAMPLE
@@ -905,8 +907,9 @@ defset list AMDGPUSampleVariants = {
 }

 defset list AMDGPUSampleVariantsNoGradients = {
+  let UsesWQM = true in
   defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"", "", []>;
-  let Bias = true in
+  let Bias = true, UsesWQM = true in
   defm AMDGPUSample : AMDGPUSampleHelper_Clamp<
     "_B", "_b", [AMDGPUArg]>;
   let LodOrClamp = "lod" in
@@ -1172,7 +1175,8 @@ defset list AMDGPUImageDimIntrinsics = {
   foreach dim = AMDGPUDims.NoMsaa in {
     def !strconcat(NAME, "_", dim.Name) : AMDGPUImageDimIntrinsic<
         AMDGPUDimSampleProfile,
-        !if(NoMem, [IntrNoMem], [IntrReadMem]),
+        !listconcat(!if(NoMem, [IntrNoMem], [IntrReadMem]),
+                    !if(sample.UsesWQM, [IntrConvergent], [])),
         !if(NoMem, [], [SDNPMemOperand])>;
   }
 }
@@ -1188,7 +1192,8 @@ defset list AMDGPUImageDimIntrinsics = {
   foreach dim = AMDGPUDims.NoMsaa in {
     def !strconcat(NAME, "_", dim.Name, "_nortn") : AMDGPUImageDimIntrinsic<
         AMDGPUDimSampleNoReturnProfile,
-        [IntrWillReturn], [SDNPMemOperand]>;
+        !listconcat([IntrWillReturn], !if(sample.UsesWQM, [IntrConvergent], [])),
+        [SDNPMemOperand]>;
   }
 }
 foreach sample = AMDGPUSampleVariants in {

diff --git a/llvm/test/CodeGen/AMDGPU/sink-image-sample.ll b/llvm/test/CodeGen/AMDGPU/sink-image-sample.ll
index e1273e1a4bcd0..eb8c3cadc4997 100644
--- a/llvm/test/CodeGen/AMDGPU/sink-image-sample.ll
+++ b/llvm/test/CodeGen/AMDGPU/sink-image-sample.ll
@@ -1,34 +1,143 @@
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s

-; Test that image.sample instruction is sunk across the branch and not left in the first block.
Since the kill may terminate the shader there might be no need for sampling the image. +; Test that image.sample LOD(_L), Level 0(_LZ), Derivative(_D) instructions are sunk across the branch and not left in the first block. Since the kill may terminate the shader there might be no need for sampling the image. ; GCN-LABEL: {{^}}sinking_img_sample: -; GCN-NOT: image_sample +; GCN-NOT: image_sample_l v +; GCN-NOT: image_sample_lz v +; GCN-NOT: image_sample_c_lz v +; GCN-NOT: image_sample_c_l v +; GCN-NOT: image_sample_d v +; GCN-NOT: image_sample_c_d v +; GCN-NOT: image_sample_d_cl v +; GCN-NOT: image_sample_c_d_cl v ; GCN: branch -; GCN: image_sample +; GCN: image_sample_l v +; GCN: image_sample_lz v +; GCN: image_sample_c_lz v +; GCN: image_sample_c_l v +; GCN: image_sample_d v +; GCN: image_sample_c_d v +; GCN: image_sample_d_cl v +; GCN: image_sample_c_d_cl v ; GCN: exp null -define amdgpu_ps float @sinking_img_sample() { +define amdgpu_ps float @sinking_img_sample(i1 %cond) { main_body: - %i = call <3 x float> @llvm.amdgcn.image.sample.2d.v3f32.f32(i32 7, float undef, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) - br i1 undef, label %endif1, label %if1 + %i1 = call <3 x float> @llvm.amdgcn.image.sample.l.2d.v3f32.f32(i32 7, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) + %i2 = call <3 x float> @llvm.amdgcn.image.sample.lz.2d.v3f32.f32(i32 7, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) + %i3 = call <3 x float> @llvm.amdgcn.image.sample.c.lz.2d.v3f32.f32(i32 7, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) + %i4 = call <3 x float> @llvm.amdgcn.image.sample.c.l.2d.v3f32.f32(i32 7, float poison, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) + %i5 = call <3 x float> @llvm.amdgcn.image.sample.d.2d.v3f32.f32(i32 7, float poison, float poison, float poison, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) + %i6 = call <3 x float> @llvm.amdgcn.image.sample.c.d.2d.v3f32.f32(i32 7, float poison, float poison, float poison, float poison, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) + %i7 = call <3 x float> @llvm.amdgcn.image.sample.d.cl.2d.v3f32.f32(i32 7, float poison, float poison, float poison, float poison, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) + %i8 = call <3 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v3f32.f32(i32 7, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) + br i1 %cond, label %endif1, label %if1 if1: ; preds = %main_body call void @llvm.amdgcn.kill(i1 false) #4 br label %exit endif1: ; preds = %main_body - %i22 = extractelement <3 x float> %i, i32 2 + %i22 = extractelement <3 x float> %i1, i32 1 %i23 = call nsz arcp contract float @llvm.fma.f32(float %i22, float 0.000000e+00, float 0.000000e+00) #1 + %i24 = extractelement <3 x float> %i2, i32 1 + %i25 = call nsz arcp contract float @llvm.fma.f32(float %i23, float %i24, float 0.000000e+00) #1 + %i26 = extractelement <3 x float> %i3, i32 1 + %i27 = call nsz arcp contract float @llvm.fma.f32(float %i25, float %i26, float 0.000000e+00) #1 + %i28 = extractelement <3 x float> %i4, i32 1 + %i29 = call nsz arcp 
contract float @llvm.fma.f32(float %i27, float %i28, float 0.000000e+00) #1 + %i30 = extractelement <3 x float> %i5, i32 1 + %i31 = call nsz arcp contract float @llvm.fma.f32(float %i29, float %i30, float 0.000000e+00) #1 + %i32 = extractelement <3 x float> %i6, i32 1 + %i33 = call nsz arcp contract float @llvm.fma.f32(float %i31, float %i32, float 0.000000e+00) #1 + %i34 = extractelement <3 x float> %i7, i32 1 + %i35 = call nsz arcp contract float @llvm.fma.f32(float %i33, float %i34, float 0.000000e+00) #1 + %i36 = extractelement <3 x float> %i8, i32 1 + %i37 = call nsz arcp contract float @llvm.fma.f32(float %i35, float %i36, float 0.000000e+00) #1 br label %exit exit: ; preds = %endif1, %if1 - %i24 = phi float [ undef, %if1 ], [ %i23, %endif1 ] - ret float %i24 + %i38 = phi float [ poison, %if1 ], [ %i37, %endif1 ] + ret float %i38 } + + +; Test that image.sample instructions which use WQM are marked as Convergent and will be left in the first block. + +; GCN-LABEL: {{^}}no_sinking_img_sample: +; GCN: image_sample v +; GCN: image_sample_c v +; GCN: image_sample_cl v +; GCN: image_sample_c_cl v +; GCN: image_sample_b v +; GCN: image_sample_c_b v +; GCN: image_sample_b_cl v +; GCN: branch +; GCN-NOT: image_sample v +; GCN-NOT: image_sample_c v +; GCN-NOT: image_sample_cl v +; GCN-NOT: image_sample_c_cl v +; GCN-NOT: image_sample_b v +; GCN-NOT: image_sample_c_b v +; GCN-NOT: image_sample_b_cl v +; GCN: exp null + +define amdgpu_ps float @no_sinking_img_sample(i1 %cond) { +main_body: + %i1 = call <3 x float> @llvm.amdgcn.image.sample.2d.v3f32.f32(i32 7, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) + %i2 = call <3 x float> @llvm.amdgcn.image.sample.c.2d.v3f32.f32(i32 7, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) + %i3 = call <3 x float> @llvm.amdgcn.image.sample.cl.2d.v3f32.f32(i32 7, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) + %i4 = call <3 x float> @llvm.amdgcn.image.sample.c.cl.2d.v3f32.f32(i32 7, float poison, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) + %i5 = call <3 x float> @llvm.amdgcn.image.sample.b.2d.v3f32.f32(i32 7, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) + %i6 = call <3 x float> @llvm.amdgcn.image.sample.c.b.2d.v3f32.f32(i32 7, float poison, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) + %i7 = call <3 x float> @llvm.amdgcn.image.sample.b.cl.2d.v3f32.f32.f32(i32 7, float poison, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) + br i1 %cond, label %endif1, label %if1 + +if1: ; preds = %main_body + call void @llvm.amdgcn.kill(i1 false) #4 + br label %exit + +endif1: ; preds = %main_body + %i22 = extractelement <3 x float> %i1, i32 2 + %i23 = call nsz arcp contract float @llvm.fma.f32(float %i22, float 0.000000e+00, float 0.000000e+00) #1 + %i24 = extractelement <3 x float> %i2, i32 2 + %i25 = call nsz arcp contract float @llvm.fma.f32(float %i23, float %i24, float 0.000000e+00) #1 + %i26 = extractelement <3 x float> %i3, i32 2 + %i27 = call nsz arcp contract float @llvm.fma.f32(float %i25, float %i26, float 0.000000e+00) #1 + %i28 = extractelement <3 x float> %i4, i32 2 + %i29 = call nsz arcp contract float @llvm.fma.f32(float %i27, float %i28, float 0.000000e+00) #1 + %i30 = extractelement <3 
x float> %i5, i32 2
+  %i31 = call nsz arcp contract float @llvm.fma.f32(float %i29, float %i30, float 0.000000e+00) #1
+  %i32 = extractelement <3 x float> %i6, i32 2
+  %i33 = call nsz arcp contract float @llvm.fma.f32(float %i31, float %i32, float 0.000000e+00) #1
+  %i34 = extractelement <3 x float> %i7, i32 2
+  %i35 = call nsz arcp contract float @llvm.fma.f32(float %i33, float %i34, float 0.000000e+00) #1
+  br label %exit
+
+exit: ; preds = %endif1, %if1
+  %i36 = phi float [ poison, %if1 ], [ %i35, %endif1 ]
+  ret float %i36
+}
+
 ; Function Attrs: nounwind readonly willreturn
 declare <3 x float> @llvm.amdgcn.image.sample.2d.v3f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
+declare <3 x float> @llvm.amdgcn.image.sample.c.2d.v3f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
+declare <3 x float> @llvm.amdgcn.image.sample.cl.2d.v3f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
+declare <3 x float> @llvm.amdgcn.image.sample.c.cl.2d.v3f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
+declare <3 x float> @llvm.amdgcn.image.sample.b.2d.v3f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
+declare <3 x float> @llvm.amdgcn.image.sample.c.b.2d.v3f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
+declare <3 x float> @llvm.amdgcn.image.sample.b.cl.2d.v3f32.f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
+declare <3 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v3f32.f32(i32 immarg, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
+declare <3 x float> @llvm.amdgcn.image.sample.l.2d.v3f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
+declare <3 x float> @llvm.amdgcn.image.sample.lz.2d.v3f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
+declare <3 x float> @llvm.amdgcn.image.sample.c.lz.2d.v3f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
+declare <3 x float> @llvm.amdgcn.image.sample.c.l.2d.v3f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
+declare <3 x float> @llvm.amdgcn.image.sample.d.2d.v3f32.f32.f32(i32 immarg, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
+declare <3 x float> @llvm.amdgcn.image.sample.c.d.2d.v3f32.f32.f32(i32 immarg, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
+declare <3 x float> @llvm.amdgcn.image.sample.d.cl.2d.v3f32.f32.f32(i32 immarg, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3
+declare <3 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v3f32.f32.f32(i32 immarg, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3

 ; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
 declare float @llvm.fma.f32(float, float, float) #2

From eb96c8c105226956c8ed5ab30699206f53de74f7 Mon Sep 17 00:00:00 2001
From: Pavel Labath
Date: Wed, 15 Jan 2025 10:37:06 +0100
Subject: [PATCH 47/82] [lldb] Implement
 (SB)Function::GetInstructions for discontinuous functions (#122933)

The main change is to permit the disassembler class to process/store
multiple (discontinuous) ranges of addresses. The result is not ambiguous
because each instruction knows its size (in addition to its address), so we
can check for discontinuity by looking at whether the next instruction
begins where the previous ends.

This patch doesn't handle the "disassemble" CLI command, which uses a more
elaborate mechanism for disassembling and printing instructions.
---
 lldb/include/lldb/Core/Disassembler.h | 11 ++++++--
 lldb/source/API/SBFunction.cpp | 2 +-
 lldb/source/API/SBInstructionList.cpp | 8 ++++++
 lldb/source/Core/Disassembler.cpp | 27 ++++++++-----------
 lldb/source/Symbol/Function.cpp | 2 +-
 .../Python/sb_function_ranges.s | 11 ++++++++
 6 files changed, 41 insertions(+), 20 deletions(-)

diff --git a/lldb/include/lldb/Core/Disassembler.h b/lldb/include/lldb/Core/Disassembler.h
index e0ad4316e0249..21bacb14f9b25 100644
--- a/lldb/include/lldb/Core/Disassembler.h
+++ b/lldb/include/lldb/Core/Disassembler.h
@@ -428,7 +428,7 @@ class Disassembler : public std::enable_shared_from_this<Disassembler>,
   static lldb::DisassemblerSP
   DisassembleRange(const ArchSpec &arch, const char *plugin_name,
                    const char *flavor, const char *cpu, const char *features,
-                   Target &target, const AddressRange &disasm_range,
+                   Target &target, llvm::ArrayRef<AddressRange> disasm_ranges,
                    bool force_live_memory = false);
 
   static lldb::DisassemblerSP
@@ -460,7 +460,11 @@ class Disassembler : public std::enable_shared_from_this<Disassembler>,
   size_t ParseInstructions(Target &target, Address address, Limit limit,
                            Stream *error_strm_ptr,
-                           bool force_live_memory = false);
+                           bool force_live_memory = false) {
+    m_instruction_list.Clear();
+    return AppendInstructions(target, address, limit, error_strm_ptr,
+                              force_live_memory);
+  }
 
   virtual size_t DecodeInstructions(const Address &base_addr,
                                     const DataExtractor &data,
@@ -480,6 +484,9 @@ class Disassembler : public std::enable_shared_from_this<Disassembler>,
                                 const char *flavor) = 0;
 
 protected:
+  size_t AppendInstructions(Target &target, Address address, Limit limit,
+                            Stream *error_strm_ptr, bool force_live_memory);
+
   // SourceLine and SourceLinesToDisplay structures are only used in the mixed
   // source and assembly display methods internal to this class.
 
diff --git a/lldb/source/API/SBFunction.cpp b/lldb/source/API/SBFunction.cpp
index 414eccc357c0e..d07594c2e8c01 100644
--- a/lldb/source/API/SBFunction.cpp
+++ b/lldb/source/API/SBFunction.cpp
@@ -127,7 +127,7 @@ SBInstructionList SBFunction::GetInstructions(SBTarget target,
       sb_instructions.SetDisassembler(Disassembler::DisassembleRange(
           module_sp->GetArchitecture(), nullptr, flavor,
           target_sp->GetDisassemblyCPU(), target_sp->GetDisassemblyFeatures(),
-          *target_sp, m_opaque_ptr->GetAddressRange(), force_live_memory));
+          *target_sp, m_opaque_ptr->GetAddressRanges(), force_live_memory));
     }
   }
   return sb_instructions;
diff --git a/lldb/source/API/SBInstructionList.cpp b/lldb/source/API/SBInstructionList.cpp
index 3f37b984cb462..c18204375dff1 100644
--- a/lldb/source/API/SBInstructionList.cpp
+++ b/lldb/source/API/SBInstructionList.cpp
@@ -151,6 +151,10 @@ bool SBInstructionList::GetDescription(Stream &sref) {
     FormatEntity::Parse("${addr}: ", format);
     SymbolContext sc;
     SymbolContext prev_sc;
+
+    // Expected address of the next instruction. Used to print an empty line
+    // for non-contiguous blocks of insns.
+    std::optional<Address> next_addr;
     for (size_t i = 0; i < num_instructions; ++i) {
       Instruction *inst =
           m_opaque_sp->GetInstructionList().GetInstructionAtIndex(i).get();
@@ -165,10 +169,14 @@ bool SBInstructionList::GetDescription(Stream &sref) {
             addr, eSymbolContextEverything, sc);
       }
 
+      if (next_addr && *next_addr != addr)
+        sref.EOL();
       inst->Dump(&sref, max_opcode_byte_size, true, false,
                  /*show_control_flow_kind=*/false, nullptr, &sc, &prev_sc,
                  &format, 0);
       sref.EOL();
+      next_addr = addr;
+      next_addr->Slide(inst->GetOpcode().GetByteSize());
     }
   return true;
 }
diff --git a/lldb/source/Core/Disassembler.cpp b/lldb/source/Core/Disassembler.cpp
index b3e7c4c13061d..b05be7e1a46d7 100644
--- a/lldb/source/Core/Disassembler.cpp
+++ b/lldb/source/Core/Disassembler.cpp
@@ -123,22 +123,19 @@ static Address ResolveAddress(Target &target, const Address &addr) {
 
 lldb::DisassemblerSP Disassembler::DisassembleRange(
     const ArchSpec &arch, const char *plugin_name, const char *flavor,
     const char *cpu, const char *features, Target &target,
-    const AddressRange &range, bool force_live_memory) {
-  if (range.GetByteSize() <= 0)
-    return {};
-
-  if (!range.GetBaseAddress().IsValid())
-    return {};
-
+    llvm::ArrayRef<AddressRange> disasm_ranges, bool force_live_memory) {
   lldb::DisassemblerSP disasm_sp = Disassembler::FindPluginForTarget(
       target, arch, flavor, cpu, features, plugin_name);
 
   if (!disasm_sp)
     return {};
 
-  const size_t bytes_disassembled = disasm_sp->ParseInstructions(
-      target, range.GetBaseAddress(), {Limit::Bytes, range.GetByteSize()},
-      nullptr, force_live_memory);
+  size_t bytes_disassembled = 0;
+  for (const AddressRange &range : disasm_ranges) {
+    bytes_disassembled += disasm_sp->AppendInstructions(
+        target, range.GetBaseAddress(), {Limit::Bytes, range.GetByteSize()},
+        nullptr, force_live_memory);
+  }
 
   if (bytes_disassembled == 0)
     return {};
@@ -1092,11 +1089,9 @@ InstructionList::GetIndexOfInstructionAtLoadAddress(lldb::addr_t load_addr,
   return GetIndexOfInstructionAtAddress(address);
 }
 
-size_t Disassembler::ParseInstructions(Target &target, Address start,
-                                       Limit limit, Stream *error_strm_ptr,
-                                       bool force_live_memory) {
-  m_instruction_list.Clear();
-
+size_t Disassembler::AppendInstructions(Target &target, Address start,
+                                        Limit limit, Stream *error_strm_ptr,
+                                        bool force_live_memory) {
   if (!start.IsValid())
     return 0;
 
@@ -1129,7 +1124,7 @@ size_t Disassembler::ParseInstructions(Target &target, Address start,
 
   return DecodeInstructions(start, data, 0,
                             limit.kind == Limit::Instructions ? limit.value
                                                               : UINT32_MAX,
-                            false, data_from_file);
+                            /*append=*/true, data_from_file);
 }
 
 // Disassembler copy constructor
diff --git a/lldb/source/Symbol/Function.cpp b/lldb/source/Symbol/Function.cpp
index c9523281dc565..15879f05a0ff0 100644
--- a/lldb/source/Symbol/Function.cpp
+++ b/lldb/source/Symbol/Function.cpp
@@ -488,7 +488,7 @@ lldb::DisassemblerSP Function::GetInstructions(const ExecutionContext &exe_ctx,
   if (module_sp && exe_ctx.HasTargetScope()) {
     return Disassembler::DisassembleRange(
         module_sp->GetArchitecture(), nullptr, nullptr, nullptr, flavor,
-        exe_ctx.GetTargetRef(), GetAddressRange(), !prefer_file_cache);
+        exe_ctx.GetTargetRef(), GetAddressRanges(), !prefer_file_cache);
   }
   return lldb::DisassemblerSP();
 }
diff --git a/lldb/test/Shell/ScriptInterpreter/Python/sb_function_ranges.s b/lldb/test/Shell/ScriptInterpreter/Python/sb_function_ranges.s
index a9e4104f2aaf7..2e2bc52cd3ff9 100644
--- a/lldb/test/Shell/ScriptInterpreter/Python/sb_function_ranges.s
+++ b/lldb/test/Shell/ScriptInterpreter/Python/sb_function_ranges.s
@@ -6,6 +6,16 @@
 # CHECK: Found 1 function(s).
 # CHECK: foo: [input.o[0x0-0xe), input.o[0x14-0x1c)]
+# CHECK-NEXT: input.o[0x0]: cmpl $0x0, %edi
+# CHECK-NEXT: input.o[0x3]: je 0x14
+# CHECK-NEXT: input.o[0x5]: jmp 0x7
+# CHECK-NEXT: input.o[0x7]: callq 0xe
+# CHECK-NEXT: input.o[0xc]: jmp 0x1b
+# CHECK-EMPTY:
+# CHECK-NEXT: input.o[0x14]: callq 0x19
+# CHECK-NEXT: input.o[0x19]: jmp 0x1b
+# CHECK-NEXT: input.o[0x1b]: retq
+
 #--- script.py
 import lldb
 
@@ -17,6 +27,7 @@ def __lldb_init_module(debugger, internal_dict):
     for ctx in sym_ctxs:
       fn = ctx.function
       print(f"{fn.name}: {fn.GetRanges()}")
+      print(fn.GetInstructions(target))
 
 #--- input.s
 # An example of a function which has been split into two parts. Roughly

From e4708260c7e9eeb817cafa6db9eee2569f00b5d2 Mon Sep 17 00:00:00 2001
From: Jack Frankland
Date: Wed, 15 Jan 2025 10:01:36 +0000
Subject: [PATCH 48/82] [vim] Improve `iskeyword` for MLIR (#121750)

Define keywords for the MLIR syntax. This allows better recognition of
semantic constructs such as SSA value identifiers, e.g. `%foo`, and gives
improved motion handling when using word-based motions such as `w` and `e`.

This is based on the work done for the LLVM IR in
8c46413f343d0a5b8db48d958890b9038f03b70d.

Signed-off-by: Jack Frankland
---
 mlir/utils/vim/ftplugin/mlir.vim | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/mlir/utils/vim/ftplugin/mlir.vim b/mlir/utils/vim/ftplugin/mlir.vim
index 83b07f51001c0..2e3845cdddc32 100644
--- a/mlir/utils/vim/ftplugin/mlir.vim
+++ b/mlir/utils/vim/ftplugin/mlir.vim
@@ -10,3 +10,12 @@ let b:did_ftplugin = 1
 setlocal softtabstop=2 shiftwidth=2
 setlocal expandtab
 setlocal comments+=://
+setlocal commentstring=//\ %s
+" We treat sequences of the following characters as forming 'keywords', with
+" the aim of easing movement around MLIR identifiers:
+" * identifier prefixes: '%' and '@' (@-@)
+" * all characters where isalpha() returns TRUE (@)
+" * the digits 0-9 (48-57)
+" * other characters that may form identifiers: '_', '.', '-', '$'
+" Comment this out to restore the default behaviour
+setlocal iskeyword=%,@-@,@,48-57,_,.,-,$

From bd768246da23ad141d3e9303cf43fd4363a6d4f4 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 14 Jan 2025 23:05:09 +0000
Subject: [PATCH 49/82] [DAG] replaceShuffleOfInsert - convert
 INSERT_VECTOR_ELT matching to use SDPatternMatch helpers. NFC.
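
For readers unfamiliar with the helpers, here is a minimal sketch of the
SDPatternMatch idiom this NFC change adopts (illustrative only; the matcher
names are the ones used in the diff below, while `matchInsertAtIndex` and
`MaskIdx` are hypothetical stand-ins for the surrounding combine code):

```cpp
// Sketch of the SDPatternMatch replacement for explicit opcode +
// ConstantSDNode checks; not part of the diff itself.
#include "llvm/CodeGen/SDPatternMatch.h"
using namespace llvm;
using namespace llvm::SDPatternMatch;

// Returns the scalar inserted at constant index MaskIdx, or an empty
// SDValue if Op0 is not a matching INSERT_VECTOR_ELT.
static SDValue matchInsertAtIndex(SDValue Op0, int MaskIdx) {
  SDValue Elt;
  // m_Value() matches (and optionally captures) any operand;
  // m_SpecificInt succeeds only for a constant equal to MaskIdx.
  if (sd_match(Op0, m_InsertElt(m_Value(), m_Value(Elt),
                                m_SpecificInt(MaskIdx))))
    return Elt;
  return SDValue();
}
```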
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 37 ++++++++++---------
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 02b79c67af3ee..6805e0cb23ace 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -26160,26 +26160,27 @@ static SDValue replaceShuffleOfInsert(ShuffleVectorSDNode *Shuf,
   // could be an operand of SCALAR_TO_VECTOR, BUILD_VECTOR, or a constant.
   assert(Mask[ShufOp0Index] >= 0 && Mask[ShufOp0Index] < (int)Mask.size() &&
          "Shuffle mask value must be from operand 0");
-  if (Op0.getOpcode() != ISD::INSERT_VECTOR_ELT)
-    return SDValue();
-  auto *InsIndexC = dyn_cast<ConstantSDNode>(Op0.getOperand(2));
-  if (!InsIndexC || InsIndexC->getSExtValue() != Mask[ShufOp0Index])
-    return SDValue();
+  SDValue Elt;
+  if (sd_match(Op0, m_InsertElt(m_Value(), m_Value(Elt),
+                                m_SpecificInt(Mask[ShufOp0Index])))) {
+    // There's an existing insertelement with constant insertion index, so we
+    // don't need to check the legality/profitability of a replacement operation
+    // that differs at most in the constant value. The target should be able to
+    // lower any of those in a similar way. If not, legalization will expand
+    // this to a scalar-to-vector plus shuffle.
+    //
+    // Note that the shuffle may move the scalar from the position that the
+    // insert element used. Therefore, our new insert element occurs at the
+    // shuffle's mask index value, not the insert's index value.
+    //
+    // shuffle (insertelt v1, x, C), v2, mask --> insertelt v2, x, C'
+    SDValue NewInsIndex = DAG.getVectorIdxConstant(ShufOp0Index, SDLoc(Shuf));
+    return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Shuf), Op0.getValueType(),
+                       Op1, Elt, NewInsIndex);
+  }
 
-  // There's an existing insertelement with constant insertion index, so we
-  // don't need to check the legality/profitability of a replacement operation
-  // that differs at most in the constant value. The target should be able to
-  // lower any of those in a similar way. If not, legalization will expand this
-  // to a scalar-to-vector plus shuffle.
-  //
-  // Note that the shuffle may move the scalar from the position that the insert
-  // element used. Therefore, our new insert element occurs at the shuffle's
-  // mask index value, not the insert's index value.
-  // shuffle (insertelt v1, x, C), v2, mask --> insertelt v2, x, C'
-  SDValue NewInsIndex = DAG.getVectorIdxConstant(ShufOp0Index, SDLoc(Shuf));
-  return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Shuf), Op0.getValueType(),
-                     Op1, Op0.getOperand(1), NewInsIndex);
+  return SDValue();
 }
 
 /// If we have a unary shuffle of a shuffle, see if it can be folded away

From 85fdf501461e8ee00401f06ee6c7d21ac6622484 Mon Sep 17 00:00:00 2001
From: Victor Campos
Date: Wed, 15 Jan 2025 10:11:39 +0000
Subject: [PATCH 50/82] [Multilib] Custom flags YAML parsing (#122903)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch is the first step to extend the current multilib system to
support the selection of library variants which do not correspond to
existing command-line options.

Proposal can be found in
https://discourse.llvm.org/t/rfc-multilib-custom-flags/81058

The multilib mechanism supports libraries that target code generation or
language options such as `--target`, `-mcpu`, `-mfpu`,
`-mbranch-protection`. However, some library variants are particular to
features that do not correspond to any command-line options.
Examples include variants for multithreading and semihosting.

This work introduces a way to instruct the multilib system to consider
these features in library selection.

This particular patch comprises a new section in `multilib.yaml` to declare
flags for which no option exists. Henceforth this sort of flag will be
called `custom flag` for clarity.

The `multilib.yaml` file will have a new section called Flags which
contains the declarations of the target's custom flags:

```yaml
Flags:
- Name: multithreaded
  Values:
  - Name: no-multithreaded
    MacroDefines: [__SINGLE_THREAD__]
  - Name: multithreaded
  Default: no-multithreaded
- Name: io
  Values:
  - Name: io-none
  - Name: io-semihosting
    MacroDefines: [SEMIHOSTING]
  - Name: io-linux-syscalls
    MacroDefines: [LINUX_SYSCALLS, HOSTED=1]
  Default: io-none
```

- Name: the name to categorize a flag.
- Values: a list of possible values.
- Default: it specifies which value this flag should take if not specified
  in the command-line invocation. It must be one value from the Values
  field.

Each flag Value follows this description:

- Name (required): the name of the custom flag value (string). This is the
  string to be used in `-fmultilib-flag=<string>`.
- MacroDefines (optional): a list of strings to be used as macro
  definitions. Each string is fed into the driver as ``-D<string>``.

A Default value is useful to save users from specifying custom flags that
have a most commonly used value.

The namespace of flag values is common across all flags. This means that
flag values must be unique.
---
 clang/include/clang/Driver/Multilib.h | 33 ++++-
 clang/lib/Driver/Multilib.cpp | 109 ++++++++++++--
 ...remetal-multilib-custom-flags-parsing.yaml | 133 ++++++++++++++++++
 3 files changed, 264 insertions(+), 11 deletions(-)
 create mode 100644 clang/test/Driver/baremetal-multilib-custom-flags-parsing.yaml

diff --git a/clang/include/clang/Driver/Multilib.h b/clang/include/clang/Driver/Multilib.h
index dbed70f4f9008..0a533ed2804e2 100644
--- a/clang/include/clang/Driver/Multilib.h
+++ b/clang/include/clang/Driver/Multilib.h
@@ -101,6 +101,30 @@ class Multilib {
 
 raw_ostream &operator<<(raw_ostream &OS, const Multilib &M);
 
+namespace custom_flag {
+struct Declaration;
+
+struct ValueDetail {
+  std::string Name;
+  std::optional<SmallVector<std::string>> MacroDefines;
+  Declaration *Decl;
+};
+
+struct Declaration {
+  std::string Name;
+  SmallVector<ValueDetail> ValueList;
+  std::optional<size_t> DefaultValueIdx;
+
+  Declaration() = default;
+  Declaration(const Declaration &);
+  Declaration(Declaration &&);
+  Declaration &operator=(const Declaration &);
+  Declaration &operator=(Declaration &&);
+};
+
+static constexpr StringRef Prefix = "-fmultilib-flag=";
+} // namespace custom_flag
+
 /// See also MultilibSetBuilder for combining multilibs into a set.
class MultilibSet { public: @@ -120,15 +144,18 @@ class MultilibSet { private: multilib_list Multilibs; - std::vector FlagMatchers; + SmallVector FlagMatchers; + SmallVector CustomFlagDecls; IncludeDirsFunc IncludeCallback; IncludeDirsFunc FilePathsCallback; public: MultilibSet() = default; MultilibSet(multilib_list &&Multilibs, - std::vector &&FlagMatchers = {}) - : Multilibs(Multilibs), FlagMatchers(FlagMatchers) {} + SmallVector &&FlagMatchers = {}, + SmallVector &&CustomFlagDecls = {}) + : Multilibs(std::move(Multilibs)), FlagMatchers(std::move(FlagMatchers)), + CustomFlagDecls(std::move(CustomFlagDecls)) {} const multilib_list &getMultilibs() { return Multilibs; } diff --git a/clang/lib/Driver/Multilib.cpp b/clang/lib/Driver/Multilib.cpp index 0207e0f2eb2de..ccf747e90cb2c 100644 --- a/clang/lib/Driver/Multilib.cpp +++ b/clang/lib/Driver/Multilib.cpp @@ -10,6 +10,7 @@ #include "clang/Basic/LLVM.h" #include "clang/Driver/Driver.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" @@ -201,13 +202,20 @@ struct MultilibGroupSerialization { struct MultilibSetSerialization { llvm::VersionTuple MultilibVersion; - std::vector Groups; - std::vector Multilibs; - std::vector FlagMatchers; + SmallVector Groups; + SmallVector Multilibs; + SmallVector FlagMatchers; + SmallVector CustomFlagDeclarations; }; } // end anonymous namespace +LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibSerialization) +LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibGroupSerialization) +LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibSet::FlagMatcher) +LLVM_YAML_IS_SEQUENCE_VECTOR(custom_flag::ValueDetail) +LLVM_YAML_IS_SEQUENCE_VECTOR(custom_flag::Declaration) + template <> struct llvm::yaml::MappingTraits { static void mapping(llvm::yaml::IO &io, MultilibSerialization &V) { io.mapOptional("Dir", V.Dir); @@ -255,11 +263,61 @@ template <> struct llvm::yaml::MappingTraits { } }; +template <> +struct llvm::yaml::MappingContextTraits> { + static void mapping(llvm::yaml::IO &io, custom_flag::ValueDetail &V, + llvm::SmallSet &) { + io.mapRequired("Name", V.Name); + io.mapOptional("MacroDefines", V.MacroDefines); + } + static std::string validate(IO &io, custom_flag::ValueDetail &V, + llvm::SmallSet &NameSet) { + if (V.Name.empty()) + return "custom flag value requires a name"; + if (!NameSet.insert(V.Name).second) + return "duplicate custom flag value name: \"" + V.Name + "\""; + return {}; + } +}; + +template <> +struct llvm::yaml::MappingContextTraits> { + static void mapping(llvm::yaml::IO &io, custom_flag::Declaration &V, + llvm::SmallSet &NameSet) { + io.mapRequired("Name", V.Name); + io.mapRequired("Values", V.ValueList, NameSet); + std::string DefaultValueName; + io.mapRequired("Default", DefaultValueName); + + for (auto [Idx, Value] : llvm::enumerate(V.ValueList)) { + Value.Decl = &V; + if (Value.Name == DefaultValueName) { + assert(!V.DefaultValueIdx); + V.DefaultValueIdx = Idx; + } + } + } + static std::string validate(IO &io, custom_flag::Declaration &V, + llvm::SmallSet &) { + if (V.Name.empty()) + return "custom flag requires a name"; + if (V.ValueList.empty()) + return "custom flag must have at least one value"; + if (!V.DefaultValueIdx) + return "custom flag must have a default value"; + return {}; + } +}; + template <> struct llvm::yaml::MappingTraits { static void mapping(llvm::yaml::IO &io, MultilibSetSerialization &M) { io.mapRequired("MultilibVersion", M.MultilibVersion); io.mapRequired("Variants", M.Multilibs); 
io.mapOptional("Groups", M.Groups); + llvm::SmallSet NameSet; + io.mapOptionalWithContext("Flags", M.CustomFlagDeclarations, NameSet); io.mapOptional("Mappings", M.FlagMatchers); } static std::string validate(IO &io, MultilibSetSerialization &M) { @@ -288,10 +346,6 @@ template <> struct llvm::yaml::MappingTraits { } }; -LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibSerialization) -LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibGroupSerialization) -LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibSet::FlagMatcher) - llvm::ErrorOr MultilibSet::parseYaml(llvm::MemoryBufferRef Input, llvm::SourceMgr::DiagHandlerTy DiagHandler, @@ -319,7 +373,8 @@ MultilibSet::parseYaml(llvm::MemoryBufferRef Input, } } - return MultilibSet(std::move(Multilibs), std::move(MS.FlagMatchers)); + return MultilibSet(std::move(Multilibs), std::move(MS.FlagMatchers), + std::move(MS.CustomFlagDeclarations)); } LLVM_DUMP_METHOD void MultilibSet::dump() const { @@ -335,3 +390,41 @@ raw_ostream &clang::driver::operator<<(raw_ostream &OS, const MultilibSet &MS) { MS.print(OS); return OS; } + +namespace clang::driver::custom_flag { +Declaration::Declaration(const Declaration &Other) + : Name(Other.Name), ValueList(Other.ValueList), + DefaultValueIdx(Other.DefaultValueIdx) { + for (ValueDetail &Detail : ValueList) + Detail.Decl = this; +} + +Declaration::Declaration(Declaration &&Other) + : Name(std::move(Other.Name)), ValueList(std::move(Other.ValueList)), + DefaultValueIdx(std::move(Other.DefaultValueIdx)) { + for (ValueDetail &Detail : ValueList) + Detail.Decl = this; +} + +Declaration &Declaration::operator=(const Declaration &Other) { + if (this == &Other) + return *this; + Name = Other.Name; + ValueList = Other.ValueList; + DefaultValueIdx = Other.DefaultValueIdx; + for (ValueDetail &Detail : ValueList) + Detail.Decl = this; + return *this; +} + +Declaration &Declaration::operator=(Declaration &&Other) { + if (this == &Other) + return *this; + Name = std::move(Other.Name); + ValueList = std::move(Other.ValueList); + DefaultValueIdx = std::move(Other.DefaultValueIdx); + for (ValueDetail &Detail : ValueList) + Detail.Decl = this; + return *this; +} +} // namespace clang::driver::custom_flag diff --git a/clang/test/Driver/baremetal-multilib-custom-flags-parsing.yaml b/clang/test/Driver/baremetal-multilib-custom-flags-parsing.yaml new file mode 100644 index 0000000000000..fe6a9a8d7f1ee --- /dev/null +++ b/clang/test/Driver/baremetal-multilib-custom-flags-parsing.yaml @@ -0,0 +1,133 @@ +# RUN: split-file %s %t + +# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/multilib-without-macro-defines.yaml %s -### -o /dev/null 2>&1 \ +# RUN: | FileCheck %s +# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/multilib-with-macro-defines.yaml %s -### -o /dev/null 2>&1 \ +# RUN: | FileCheck %s +# CHECK-NOT: error: + +# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/missing-flag-name.yaml %s -### -o /dev/null 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-MISSING-FLAG-NAME +# CHECK-MISSING-FLAG-NAME: error: custom flag requires a name + +# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/missing-flag-values.yaml %s -### -o /dev/null 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-MISSING-FLAG-VALUES +# CHECK-MISSING-FLAG-VALUES: error: custom flag must have at least one value + +# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/missing-flag-value-default.yaml %s -### -o /dev/null 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-MISSING-FLAG-VALUE-DEFAULT +# CHECK-MISSING-FLAG-VALUE-DEFAULT: error: custom flag must 
have a default value + +# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/missing-flag-value-name.yaml %s -### -o /dev/null 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-MISSING-FLAG-VALUE-NAME +# CHECK-MISSING-FLAG-VALUE-NAME: error: custom flag value requires a name + +# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/duplicate-flag-value-name.yaml %s -### -o /dev/null 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-DUPLICATE-FLAG-VALUE-NAME +# CHECK-DUPLICATE-FLAG-VALUE-NAME: error: duplicate custom flag value name: "value-name" +# CHECK-DUPLICATE-FLAG-VALUE-NAME-NEXT: - Name: value-name + +#--- multilib-without-macro-defines.yaml +--- +MultilibVersion: 1.0 + +Variants: +- Dir: libc + Flags: [-fmultilib-flag=a] + +Flags: + - Name: flag + Values: + - Name: a + - Name: b + Default: a + +#--- multilib-with-macro-defines.yaml +--- +MultilibVersion: 1.0 + +Variants: +- Dir: libc + Flags: [-fmultilib-flag=a] + +Flags: + - Name: flag + Values: + - Name: a + MacroDefines: [FEATURE_A] + - Name: b + MacroDefines: [FEATURE_B] + Default: a + +#--- missing-flag-name.yaml +--- +MultilibVersion: 1.0 + +Variants: +- Dir: libc + Flags: [-fmultilib-flag=a] + +Flags: + - Values: + - Name: a + Default: a + +#--- missing-flag-values.yaml +--- +MultilibVersion: 1.0 + +Variants: +- Dir: libc + Flags: [-fmultilib-flag=a] + +Flags: + - Name: flag + Values: + Default: a + +#--- missing-flag-value-default.yaml +--- +MultilibVersion: 1.0 + +Variants: +- Dir: libc + Flags: [-fmultilib-flag=a] + +Flags: + - Name: flag + Values: + - Name: a + Default: + +#--- missing-flag-value-name.yaml +--- +MultilibVersion: 1.0 + +Variants: +- Dir: libc + Flags: [-fmultilib-flag=a] + +Flags: + - Name: flag + Values: + - Name: + Default: a + +#--- duplicate-flag-value-name.yaml +--- +MultilibVersion: 1.0 + +Variants: +- Dir: libc + Flags: [-fmultilib-flag=value-name] + +Flags: + - Name: a + Values: + - Name: value-name + - Name: value-a + Default: value-name + - Name: b + Values: + - Name: value-name + Default: value-name From a5b88cb815d8f38698a3064a727b59143e0dae42 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Wed, 15 Jan 2025 10:14:51 +0000 Subject: [PATCH 51/82] [libclc] Add missing includes to CLC headers (#118654) There's no automatic way of checking these headers are self-contained. Instead of including these common files many times across the whole codebase, we can include them in the generic `gentype.inc` and `floatn.inc` files which are included by most CLC headers. 
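
By way of illustration, this is the usual libclc pattern that benefits (a
hedged sketch: `clc_foo` is a hypothetical name, and the exact headers each
.inc pulls in are whichever common ones it needs): a header expands a
per-gentype body via `gentype.inc`, and with the common includes now living
in the .inc file, the header no longer has to repeat them to stay
self-contained.

```c
// Hypothetical libclc header sketch (clc_foo is a made-up name).
// The common declaration/type headers are pulled in by gentype.inc itself,
// so nothing else needs to be included here.
#ifndef __CLC_SHARED_CLC_FOO_H__
#define __CLC_SHARED_CLC_FOO_H__

#define __CLC_BODY <clc/shared/clc_foo.inc>
#include <clc/math/gentype.inc>

#endif // __CLC_SHARED_CLC_FOO_H__
```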
--- libclc/{generic => clc}/include/clc/geometric/floatn.inc | 3 +++ libclc/clc/include/clc/integer/gentype.inc | 3 +++ libclc/clc/include/clc/math/gentype.inc | 3 +++ libclc/clc/include/clc/math/unary_intrin.inc | 3 +++ libclc/clc/include/clc/relational/clc_all.h | 1 + libclc/clc/include/clc/relational/clc_any.h | 1 + libclc/clc/include/clc/relational/clc_isequal.h | 1 + libclc/clc/include/clc/relational/clc_isinf.h | 1 + libclc/clc/include/clc/relational/clc_isnan.h | 1 + libclc/clc/include/clc/relational/floatn.inc | 2 ++ libclc/clc/include/clc/shared/clc_clamp.h | 3 --- 11 files changed, 19 insertions(+), 3 deletions(-) rename libclc/{generic => clc}/include/clc/geometric/floatn.inc (96%) diff --git a/libclc/generic/include/clc/geometric/floatn.inc b/libclc/clc/include/clc/geometric/floatn.inc similarity index 96% rename from libclc/generic/include/clc/geometric/floatn.inc rename to libclc/clc/include/clc/geometric/floatn.inc index 49c797f9f1845..919c2cadbff4f 100644 --- a/libclc/generic/include/clc/geometric/floatn.inc +++ b/libclc/clc/include/clc/geometric/floatn.inc @@ -1,3 +1,6 @@ +#include +#include + #define __CLC_FLOAT float #define __CLC_FPSIZE 32 diff --git a/libclc/clc/include/clc/integer/gentype.inc b/libclc/clc/include/clc/integer/gentype.inc index 2c8dd143db879..98682a6d32c70 100644 --- a/libclc/clc/include/clc/integer/gentype.inc +++ b/libclc/clc/include/clc/integer/gentype.inc @@ -1,3 +1,6 @@ +#include +#include + // These 2 defines only change when switching between data sizes or base types // to keep this file manageable. #define __CLC_GENSIZE 8 diff --git a/libclc/clc/include/clc/math/gentype.inc b/libclc/clc/include/clc/math/gentype.inc index 966b4269f66c1..87719f2d9bc0e 100644 --- a/libclc/clc/include/clc/math/gentype.inc +++ b/libclc/clc/include/clc/math/gentype.inc @@ -1,3 +1,6 @@ +#include +#include + #define __CLC_SCALAR_GENTYPE float #define __CLC_FPSIZE 32 diff --git a/libclc/clc/include/clc/math/unary_intrin.inc b/libclc/clc/include/clc/math/unary_intrin.inc index c331d3ff08a61..5ea2246244bef 100644 --- a/libclc/clc/include/clc/math/unary_intrin.inc +++ b/libclc/clc/include/clc/math/unary_intrin.inc @@ -1,3 +1,6 @@ +#include +#include + _CLC_OVERLOAD float __CLC_FUNCTION(float f) __asm(__CLC_INTRINSIC ".f32"); _CLC_OVERLOAD float2 __CLC_FUNCTION(float2 f) __asm(__CLC_INTRINSIC ".v2f32"); _CLC_OVERLOAD float3 __CLC_FUNCTION(float3 f) __asm(__CLC_INTRINSIC ".v3f32"); diff --git a/libclc/clc/include/clc/relational/clc_all.h b/libclc/clc/include/clc/relational/clc_all.h index bf068105aa1be..7be3d132dd53d 100644 --- a/libclc/clc/include/clc/relational/clc_all.h +++ b/libclc/clc/include/clc/relational/clc_all.h @@ -7,6 +7,7 @@ #else #include +#include #define _CLC_ALL_DECL(TYPE) _CLC_OVERLOAD _CLC_DECL int __clc_all(TYPE v); diff --git a/libclc/clc/include/clc/relational/clc_any.h b/libclc/clc/include/clc/relational/clc_any.h index f947b77e08341..27dbffeb2eecd 100644 --- a/libclc/clc/include/clc/relational/clc_any.h +++ b/libclc/clc/include/clc/relational/clc_any.h @@ -7,6 +7,7 @@ #else #include +#include #define _CLC_ANY_DECL(TYPE) _CLC_OVERLOAD _CLC_DECL int __clc_any(TYPE v); diff --git a/libclc/clc/include/clc/relational/clc_isequal.h b/libclc/clc/include/clc/relational/clc_isequal.h index 3a36ea24fd299..0f31fb9530a14 100644 --- a/libclc/clc/include/clc/relational/clc_isequal.h +++ b/libclc/clc/include/clc/relational/clc_isequal.h @@ -7,6 +7,7 @@ #else #include +#include #define _CLC_ISEQUAL_DECL(TYPE, RETTYPE) \ _CLC_OVERLOAD _CLC_DECL RETTYPE 
__clc_isequal(TYPE x, TYPE y); diff --git a/libclc/clc/include/clc/relational/clc_isinf.h b/libclc/clc/include/clc/relational/clc_isinf.h index c33ef9bb9527d..3f60bec5654a2 100644 --- a/libclc/clc/include/clc/relational/clc_isinf.h +++ b/libclc/clc/include/clc/relational/clc_isinf.h @@ -7,6 +7,7 @@ #else #include +#include #define _CLC_ISINF_DECL(RET_TYPE, ARG_TYPE) \ _CLC_OVERLOAD _CLC_DECL RET_TYPE __clc_isinf(ARG_TYPE); diff --git a/libclc/clc/include/clc/relational/clc_isnan.h b/libclc/clc/include/clc/relational/clc_isnan.h index 08351eb5515f9..3200e593c5cff 100644 --- a/libclc/clc/include/clc/relational/clc_isnan.h +++ b/libclc/clc/include/clc/relational/clc_isnan.h @@ -7,6 +7,7 @@ #else #include +#include #define _CLC_ISNAN_DECL(RET_TYPE, ARG_TYPE) \ _CLC_OVERLOAD _CLC_DECL RET_TYPE __clc_isnan(ARG_TYPE); diff --git a/libclc/clc/include/clc/relational/floatn.inc b/libclc/clc/include/clc/relational/floatn.inc index fc0d6878b4aa7..18fb20bd9effe 100644 --- a/libclc/clc/include/clc/relational/floatn.inc +++ b/libclc/clc/include/clc/relational/floatn.inc @@ -1,3 +1,5 @@ +#include +#include #define __CLC_FLOATN float #define __CLC_INTN int diff --git a/libclc/clc/include/clc/shared/clc_clamp.h b/libclc/clc/include/clc/shared/clc_clamp.h index a84184c1750a5..d9d39413c5618 100644 --- a/libclc/clc/include/clc/shared/clc_clamp.h +++ b/libclc/clc/include/clc/shared/clc_clamp.h @@ -6,9 +6,6 @@ #define __clc_clamp clamp #else -#include -#include - #define __CLC_BODY #include From 9bc88280931e3b08adfab6951047191dfe12392b Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Wed, 15 Jan 2025 10:16:19 +0000 Subject: [PATCH 52/82] [OMPIRBuilder][MLIR] Add support for target 'if' clause (#122478) This patch implements support for handling the 'if' clause of OpenMP 'target' constructs in the OMPIRBuilder and updates MLIR to LLVM IR translation of the `omp.target` MLIR operation to make use of this new feature. --- .../llvm/Frontend/OpenMP/OMPIRBuilder.h | 14 +- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 210 ++++++++++-------- .../Frontend/OpenMPIRBuilderTest.cpp | 41 ++-- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 11 +- mlir/test/Target/LLVMIR/omptarget-if.mlir | 68 ++++++ mlir/test/Target/LLVMIR/openmp-todo.mlir | 11 - 6 files changed, 220 insertions(+), 135 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/omptarget-if.mlir diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index 7eceec3d8cf8f..6b6e5bc19d95a 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -2994,27 +2994,29 @@ class OpenMPIRBuilder { /// \param Loc where the target data construct was encountered. /// \param IsOffloadEntry whether it is an offload entry. /// \param CodeGenIP The insertion point where the call to the outlined - /// function should be emitted. + /// function should be emitted. /// \param EntryInfo The entry information about the function. /// \param DefaultAttrs Structure containing the default attributes, including /// numbers of threads and teams to launch the kernel with. /// \param RuntimeAttrs Structure containing the runtime numbers of threads /// and teams to launch the kernel with. + /// \param IfCond value of the `if` clause. /// \param Inputs The input values to the region that will be passed. - /// as arguments to the outlined function. + /// as arguments to the outlined function. /// \param BodyGenCB Callback that will generate the region code. 
/// \param ArgAccessorFuncCB Callback that will generate accessors - /// instructions for passed in target arguments where neccessary + /// instructions for passed in target arguments where neccessary /// \param Dependencies A vector of DependData objects that carry - // dependency information as passed in the depend clause - // \param HasNowait Whether the target construct has a `nowait` clause or not. + /// dependency information as passed in the depend clause + /// \param HasNowait Whether the target construct has a `nowait` clause or + /// not. InsertPointOrErrorTy createTarget( const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, const TargetKernelDefaultAttrs &DefaultAttrs, - const TargetKernelRuntimeAttrs &RuntimeAttrs, + const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, SmallVectorImpl &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 3d461f0ad4228..c6603635d5e28 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -5308,8 +5308,8 @@ void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop, Value *Alignment = AlignedItem.second; Instruction *loadInst = dyn_cast(AlignedPtr); Builder.SetInsertPoint(loadInst->getNextNode()); - Builder.CreateAlignmentAssumption(F->getDataLayout(), - AlignedPtr, Alignment); + Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr, + Alignment); } Builder.restoreIP(IP); } @@ -5457,16 +5457,16 @@ static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) { Loop *L = LI.getLoopFor(CLI->getHeader()); assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop"); - TargetTransformInfo::UnrollingPreferences UP = - gatherUnrollingPreferences(L, SE, TTI, - /*BlockFrequencyInfo=*/nullptr, - /*ProfileSummaryInfo=*/nullptr, ORE, static_cast(OptLevel), - /*UserThreshold=*/std::nullopt, - /*UserCount=*/std::nullopt, - /*UserAllowPartial=*/true, - /*UserAllowRuntime=*/true, - /*UserUpperBound=*/std::nullopt, - /*UserFullUnrollMaxCount=*/std::nullopt); + TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( + L, SE, TTI, + /*BlockFrequencyInfo=*/nullptr, + /*ProfileSummaryInfo=*/nullptr, ORE, static_cast(OptLevel), + /*UserThreshold=*/std::nullopt, + /*UserCount=*/std::nullopt, + /*UserAllowPartial=*/true, + /*UserAllowRuntime=*/true, + /*UserUpperBound=*/std::nullopt, + /*UserFullUnrollMaxCount=*/std::nullopt); UP.Force = true; @@ -7340,7 +7340,7 @@ emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, - Function *OutlinedFn, Constant *OutlinedFnID, + Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, SmallVector Dependencies = {}, @@ -7386,9 +7386,9 @@ emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, return Error::success(); }; - // If we don't have an ID for the target region, it means an offload entry - // wasn't created. In this case we just run the host fallback directly. 
- if (!OutlinedFnID) { + auto &&EmitTargetCallElse = + [&](OpenMPIRBuilder::InsertPointTy AllocaIP, + OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error { // Assume no error was returned because EmitTargetCallFallbackCB doesn't // produce any. OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() { @@ -7404,102 +7404,126 @@ emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, }()); Builder.restoreIP(AfterIP); - return; - } - - OpenMPIRBuilder::TargetDataInfo Info( - /*RequiresDevicePointerInfo=*/false, - /*SeparateBeginEndCalls=*/true); - - OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP()); - OpenMPIRBuilder::TargetDataRTArgs RTArgs; - OMPBuilder.emitOffloadingArraysAndArgs(AllocaIP, Builder.saveIP(), Info, - RTArgs, MapInfo, - /*IsNonContiguous=*/true, - /*ForEndCall=*/false); - - SmallVector NumTeamsC; - for (auto [DefaultVal, RuntimeVal] : - zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams)) - NumTeamsC.push_back(RuntimeVal ? RuntimeVal : Builder.getInt32(DefaultVal)); - - // Calculate number of threads: 0 if no clauses specified, otherwise it is the - // minimum between optional THREAD_LIMIT and NUM_THREADS clauses. - auto InitMaxThreadsClause = [&Builder](Value *Clause) { - if (Clause) - Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(), - /*isSigned=*/false); - return Clause; + return Error::success(); }; - auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) { - if (Clause) - Result = Result - ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause), + + auto &&EmitTargetCallThen = + [&](OpenMPIRBuilder::InsertPointTy AllocaIP, + OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error { + OpenMPIRBuilder::TargetDataInfo Info( + /*RequiresDevicePointerInfo=*/false, + /*SeparateBeginEndCalls=*/true); + + OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP()); + OpenMPIRBuilder::TargetDataRTArgs RTArgs; + OMPBuilder.emitOffloadingArraysAndArgs(AllocaIP, Builder.saveIP(), Info, + RTArgs, MapInfo, + /*IsNonContiguous=*/true, + /*ForEndCall=*/false); + + SmallVector NumTeamsC; + for (auto [DefaultVal, RuntimeVal] : + zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams)) + NumTeamsC.push_back(RuntimeVal ? RuntimeVal + : Builder.getInt32(DefaultVal)); + + // Calculate number of threads: 0 if no clauses specified, otherwise it is + // the minimum between optional THREAD_LIMIT and NUM_THREADS clauses. + auto InitMaxThreadsClause = [&Builder](Value *Clause) { + if (Clause) + Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(), + /*isSigned=*/false); + return Clause; + }; + auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) { + if (Clause) + Result = + Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause), Result, Clause) : Clause; - }; + }; - // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so - // the NUM_THREADS clause is overriden by THREAD_LIMIT. - SmallVector NumThreadsC; - Value *MaxThreadsClause = RuntimeAttrs.TeamsThreadLimit.size() == 1 - ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads) - : nullptr; + // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so + // the NUM_THREADS clause is overriden by THREAD_LIMIT. + SmallVector NumThreadsC; + Value *MaxThreadsClause = + RuntimeAttrs.TeamsThreadLimit.size() == 1 + ? 
InitMaxThreadsClause(RuntimeAttrs.MaxThreads) + : nullptr; - for (auto [TeamsVal, TargetVal] : zip_equal(RuntimeAttrs.TeamsThreadLimit, - RuntimeAttrs.TargetThreadLimit)) { - Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal); - Value *NumThreads = InitMaxThreadsClause(TargetVal); + for (auto [TeamsVal, TargetVal] : zip_equal( + RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) { + Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal); + Value *NumThreads = InitMaxThreadsClause(TargetVal); - CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads); - CombineMaxThreadsClauses(MaxThreadsClause, NumThreads); + CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads); + CombineMaxThreadsClauses(MaxThreadsClause, NumThreads); - NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0)); - } + NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0)); + } - unsigned NumTargetItems = Info.NumberOfPtrs; - // TODO: Use correct device ID - Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF); - uint32_t SrcLocStrSize; - Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize); - Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize, - llvm::omp::IdentFlag(0), 0); + unsigned NumTargetItems = Info.NumberOfPtrs; + // TODO: Use correct device ID + Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize); + Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize, + llvm::omp::IdentFlag(0), 0); - Value *TripCount = RuntimeAttrs.LoopTripCount - ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount, - Builder.getInt64Ty(), - /*isSigned=*/false) - : Builder.getInt64(0); + Value *TripCount = RuntimeAttrs.LoopTripCount + ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount, + Builder.getInt64Ty(), + /*isSigned=*/false) + : Builder.getInt64(0); - // TODO: Use correct DynCGGroupMem - Value *DynCGGroupMem = Builder.getInt32(0); + // TODO: Use correct DynCGGroupMem + Value *DynCGGroupMem = Builder.getInt32(0); - KArgs = OpenMPIRBuilder::TargetKernelArgs(NumTargetItems, RTArgs, TripCount, - NumTeamsC, NumThreadsC, - DynCGGroupMem, HasNoWait); + KArgs = OpenMPIRBuilder::TargetKernelArgs(NumTargetItems, RTArgs, TripCount, + NumTeamsC, NumThreadsC, + DynCGGroupMem, HasNoWait); - // Assume no error was returned because TaskBodyCB and - // EmitTargetCallFallbackCB don't produce any. - OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() { - // The presence of certain clauses on the target directive require the - // explicit generation of the target task. - if (RequiresOuterTargetTask) - return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP, - Dependencies, HasNoWait); + // Assume no error was returned because TaskBodyCB and + // EmitTargetCallFallbackCB don't produce any. + OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() { + // The presence of certain clauses on the target directive require the + // explicit generation of the target task. 
+ if (RequiresOuterTargetTask) + return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP, + Dependencies, HasNoWait); + + return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID, + EmitTargetCallFallbackCB, KArgs, + DeviceID, RTLoc, AllocaIP); + }()); + + Builder.restoreIP(AfterIP); + return Error::success(); + }; - return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID, - EmitTargetCallFallbackCB, KArgs, - DeviceID, RTLoc, AllocaIP); - }()); + // If we don't have an ID for the target region, it means an offload entry + // wasn't created. In this case we just run the host fallback directly and + // ignore any potential 'if' clauses. + if (!OutlinedFnID) { + cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP())); + return; + } + + // If there's no 'if' clause, only generate the kernel launch code path. + if (!IfCond) { + cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP())); + return; + } - Builder.restoreIP(AfterIP); + cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen, + EmitTargetCallElse, AllocaIP)); } OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget( const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, const TargetKernelDefaultAttrs &DefaultAttrs, - const TargetKernelRuntimeAttrs &RuntimeAttrs, + const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, SmallVectorImpl &Args, GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, @@ -7524,7 +7548,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget( // to make a remote call (offload) to the previously outlined function // that represents the target region. Do that now. 
if (!Config.isTargetDevice()) - emitTargetCall(*this, Builder, AllocaIP, DefaultAttrs, RuntimeAttrs, + emitTargetCall(*this, Builder, AllocaIP, DefaultAttrs, RuntimeAttrs, IfCond, OutlinedFn, OutlinedFnID, Args, GenMapInfoCB, Dependencies, HasNowait); return Builder.saveIP(); diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index 3b571cce09a4f..a7b513bdfdc66 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -6243,8 +6243,8 @@ TEST_F(OpenMPIRBuilderTest, TargetRegion) { OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTarget(OmpLoc, /*IsOffloadEntry=*/true, Builder.saveIP(), Builder.saveIP(), EntryInfo, DefaultAttrs, - RuntimeAttrs, Inputs, GenMapInfoCB, BodyGenCB, - SimpleArgAccessorCB)); + RuntimeAttrs, /*IfCond=*/nullptr, Inputs, + GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB)); Builder.restoreIP(AfterIP); OMPBuilder.finalize(); @@ -6402,11 +6402,12 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) { /*ExecFlags=*/omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC, /*MaxTeams=*/{-1}, /*MinTeams=*/0, /*MaxThreads=*/{0}, /*MinThreads=*/0}; - ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP, - OMPBuilder.createTarget( - Loc, /*IsOffloadEntry=*/true, EntryIP, EntryIP, - EntryInfo, DefaultAttrs, RuntimeAttrs, CapturedArgs, - GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB)); + ASSERT_EXPECTED_INIT( + OpenMPIRBuilder::InsertPointTy, AfterIP, + OMPBuilder.createTarget(Loc, /*IsOffloadEntry=*/true, EntryIP, EntryIP, + EntryInfo, DefaultAttrs, RuntimeAttrs, + /*IfCond=*/nullptr, CapturedArgs, GenMapInfoCB, + BodyGenCB, SimpleArgAccessorCB)); Builder.restoreIP(AfterIP); Builder.CreateRetVoid(); @@ -6561,8 +6562,8 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionSPMD) { OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTarget(OmpLoc, /*IsOffloadEntry=*/true, Builder.saveIP(), Builder.saveIP(), EntryInfo, DefaultAttrs, - RuntimeAttrs, Inputs, GenMapInfoCB, BodyGenCB, - SimpleArgAccessorCB)); + RuntimeAttrs, /*IfCond=*/nullptr, Inputs, + GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB)); Builder.restoreIP(AfterIP); OMPBuilder.finalize(); @@ -6660,11 +6661,12 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDeviceSPMD) { /*ExecFlags=*/omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD, /*MaxTeams=*/{-1}, /*MinTeams=*/0, /*MaxThreads=*/{0}, /*MinThreads=*/0}; - ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP, - OMPBuilder.createTarget( - Loc, /*IsOffloadEntry=*/true, EntryIP, EntryIP, - EntryInfo, DefaultAttrs, RuntimeAttrs, CapturedArgs, - GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB)); + ASSERT_EXPECTED_INIT( + OpenMPIRBuilder::InsertPointTy, AfterIP, + OMPBuilder.createTarget(Loc, /*IsOffloadEntry=*/true, EntryIP, EntryIP, + EntryInfo, DefaultAttrs, RuntimeAttrs, + /*IfCond=*/nullptr, CapturedArgs, GenMapInfoCB, + BodyGenCB, SimpleArgAccessorCB)); Builder.restoreIP(AfterIP); Builder.CreateRetVoid(); @@ -6774,11 +6776,12 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) { /*ExecFlags=*/omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC, /*MaxTeams=*/{-1}, /*MinTeams=*/0, /*MaxThreads=*/{0}, /*MinThreads=*/0}; - ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP, - OMPBuilder.createTarget( - Loc, /*IsOffloadEntry=*/true, EntryIP, EntryIP, - EntryInfo, DefaultAttrs, RuntimeAttrs, CapturedArgs, - GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB)); + ASSERT_EXPECTED_INIT( + OpenMPIRBuilder::InsertPointTy, AfterIP, + OMPBuilder.createTarget(Loc, 
/*IsOffloadEntry=*/true, EntryIP, EntryIP, + EntryInfo, DefaultAttrs, RuntimeAttrs, + /*IfCond=*/nullptr, CapturedArgs, GenMapInfoCB, + BodyGenCB, SimpleArgAccessorCB)); Builder.restoreIP(AfterIP); Builder.CreateRetVoid(); diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 0be515e63b470..abef2cb7411aa 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -183,10 +183,6 @@ static LogicalResult checkImplementationStatus(Operation &op) { result = op.emitError("not yet implemented: host evaluation of loop " "bounds in omp.target operation"); }; - auto checkIf = [&todo](auto op, LogicalResult &result) { - if (op.getIfExpr()) - result = todo("if"); - }; auto checkInReduction = [&todo](auto op, LogicalResult &result) { if (!op.getInReductionVars().empty() || op.getInReductionByref() || op.getInReductionSyms()) @@ -306,7 +302,6 @@ static LogicalResult checkImplementationStatus(Operation &op) { checkDevice(op, result); checkHasDeviceAddr(op, result); checkHostEval(op, result); - checkIf(op, result); checkInReduction(op, result); checkIsDevicePtr(op, result); checkPrivate(op, result); @@ -4378,10 +4373,14 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, findAllocaInsertPoint(builder, moduleTranslation); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); + llvm::Value *ifCond = nullptr; + if (Value targetIfCond = targetOp.getIfExpr()) + ifCond = moduleTranslation.lookupValue(targetIfCond); + llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP = moduleTranslation.getOpenMPBuilder()->createTarget( ompLoc, isOffloadEntry, allocaIP, builder.saveIP(), entryInfo, - defaultAttrs, runtimeAttrs, kernelInput, genMapInfoCB, bodyCB, + defaultAttrs, runtimeAttrs, ifCond, kernelInput, genMapInfoCB, bodyCB, argAccessorCB, dds, targetOp.getNowait()); if (failed(handleError(afterIP, opInst))) diff --git a/mlir/test/Target/LLVMIR/omptarget-if.mlir b/mlir/test/Target/LLVMIR/omptarget-if.mlir new file mode 100644 index 0000000000000..706ad4411438b --- /dev/null +++ b/mlir/test/Target/LLVMIR/omptarget-if.mlir @@ -0,0 +1,68 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +module attributes {omp.is_target_device = false, omp.target_triples = ["amdgcn-amd-amdhsa"]} { + llvm.func @target_if_variable(%x : i1) { + omp.target if(%x) { + omp.terminator + } + llvm.return + } + + // CHECK-LABEL: define void @target_if_variable( + // CHECK-SAME: i1 %[[IF_COND:.*]]) + // CHECK: br i1 %[[IF_COND]], label %[[THEN_LABEL:.*]], label %[[ELSE_LABEL:.*]] + + // CHECK: [[THEN_LABEL]]: + // CHECK-NOT: {{^.*}}: + // CHECK: %[[RC:.*]] = call i32 @__tgt_target_kernel + // CHECK-NEXT: %[[OFFLOAD_SUCCESS:.*]] = icmp ne i32 %[[RC]], 0 + // CHECK-NEXT: br i1 %[[OFFLOAD_SUCCESS]], label %[[OFFLOAD_FAIL_LABEL:.*]], label %[[OFFLOAD_CONT_LABEL:.*]] + + // CHECK: [[OFFLOAD_FAIL_LABEL]]: + // CHECK-NEXT: call void @[[FALLBACK_FN:__omp_offloading_.*_.*_target_if_variable_l.*]]() + // CHECK-NEXT: br label %[[OFFLOAD_CONT_LABEL]] + + // CHECK: [[OFFLOAD_CONT_LABEL]]: + // CHECK-NEXT: br label %[[END_LABEL:.*]] + + // CHECK: [[ELSE_LABEL]]: + // CHECK-NEXT: call void @[[FALLBACK_FN]]() + // CHECK-NEXT: br label %[[END_LABEL]] + + llvm.func @target_if_true() { + %0 = llvm.mlir.constant(true) : i1 + omp.target if(%0) { + omp.terminator + } + llvm.return + } + + // CHECK-LABEL: define void @target_if_true() + 
// CHECK-NOT: {{^.*}}:
+  // CHECK: br label %[[ENTRY:.*]]
+
+  // CHECK: [[ENTRY]]:
+  // CHECK-NOT: {{^.*}}:
+  // CHECK: %[[RC:.*]] = call i32 @__tgt_target_kernel
+  // CHECK-NEXT: %[[OFFLOAD_SUCCESS:.*]] = icmp ne i32 %[[RC]], 0
+  // CHECK-NEXT: br i1 %[[OFFLOAD_SUCCESS]], label %[[OFFLOAD_FAIL_LABEL:.*]], label %[[OFFLOAD_CONT_LABEL:.*]]
+
+  // CHECK: [[OFFLOAD_FAIL_LABEL]]:
+  // CHECK-NEXT: call void @[[FALLBACK_FN:.*]]()
+  // CHECK-NEXT: br label %[[OFFLOAD_CONT_LABEL]]
+
+  llvm.func @target_if_false() {
+    %0 = llvm.mlir.constant(false) : i1
+    omp.target if(%0) {
+      omp.terminator
+    }
+    llvm.return
+  }
+
+  // CHECK-LABEL: define void @target_if_false()
+  // CHECK-NEXT: br label %[[ENTRY:.*]]
+
+  // CHECK: [[ENTRY]]:
+  // CHECK-NEXT: call void @__omp_offloading_{{.*}}_{{.*}}_target_if_false_l{{.*}}()
+}
+
diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir
index 392a6558dcfa6..c1e30964b2507 100644
--- a/mlir/test/Target/LLVMIR/openmp-todo.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir
@@ -271,17 +271,6 @@ llvm.func @target_host_eval(%x : i32) {
 
 // -----
 
-llvm.func @target_if(%x : i1) {
-  // expected-error@below {{not yet implemented: Unhandled clause if in omp.target operation}}
-  // expected-error@below {{LLVM Translation failed for operation: omp.target}}
-  omp.target if(%x) {
-    omp.terminator
-  }
-  llvm.return
-}
-
-// -----
-
 omp.declare_reduction @add_f32 : f32
 init {
 ^bb0(%arg: f32):

From 2a044f8a092efb27fa1837f953bce8237d41e59b Mon Sep 17 00:00:00 2001
From: Will Froom
Date: Wed, 15 Jan 2025 10:22:02 +0000
Subject: [PATCH 53/82] [MLIR] Add [[maybe_unused]] to variables only used in
 asserts (#123037)

Add [[maybe_unused]] to suppress warnings when `NDEBUG` is enabled
---
 .../lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
index a674a59009181..95064083b21d4 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
@@ -1291,7 +1291,7 @@ using ExtractNBitsFn =
 /// bitwise ops to avoid leaving LLVM to scramble with peephole optimizations.
 static Value rewriteI4ToI8Ext(PatternRewriter &rewriter, Location loc,
                               Value srcValue, const ExtractNBitsFn &extFn) {
-  auto srcVecType = cast<VectorType>(srcValue.getType());
+  [[maybe_unused]] auto srcVecType = cast<VectorType>(srcValue.getType());
   assert(srcVecType.getElementType().isSignlessInteger(4) &&
          "Expected i4 type");
 
@@ -1311,7 +1311,7 @@ static Value rewriteI4ToI8Ext(PatternRewriter &rewriter, Location loc,
 /// bitwise ops to avoid leaving LLVM to scramble with peephole optimizations.
 static Value rewriteI2ToI8Ext(PatternRewriter &rewriter, Location loc,
                               Value srcValue, const ExtractNBitsFn &extFn) {
-  VectorType srcVecType = cast<VectorType>(srcValue.getType());
+  [[maybe_unused]] VectorType srcVecType = cast<VectorType>(srcValue.getType());
   assert(srcVecType.getElementType().isSignlessInteger(2) &&
          "Expected i2 type");

From defd0d966d5ebae37787e76b86f2f2ff2a5cfd59 Mon Sep 17 00:00:00 2001
From: Schrodinger ZHU Yifan
Date: Wed, 15 Jan 2025 18:27:05 +0800
Subject: [PATCH 54/82] [libc] implement unistd/getentropy (#122692)

Implement the GNU extension getentropy. This function is used by many
programs to acquire entropy without having to handle the retry loop around
getrandom themselves.
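
For context, the retry loop that getentropy hides from its callers looks
roughly like this (an illustrative sketch only, written against the public
getrandom(2)/getentropy(3) contracts; the actual entrypoint added by this
patch lives in libc/src/unistd/linux/getentropy.cpp and is built on LLVM
libc's internal syscall machinery, not the public getrandom shown here):

```c
#include <errno.h>
#include <sys/random.h>

// Sketch of getentropy(3)-style semantics on top of getrandom(2):
// requests are capped at 256 bytes, and short reads or EINTR are
// retried until the buffer is completely filled.
int getentropy_sketch(void *buffer, size_t length) {
  if (length > 256) {
    errno = EIO;
    return -1;
  }
  unsigned char *cursor = buffer;
  while (length > 0) {
    ssize_t ret = getrandom(cursor, length, 0);
    if (ret < 0) {
      if (errno == EINTR)
        continue; // interrupted by a signal: retry
      errno = EIO;
      return -1;
    }
    cursor += ret;
    length -= (size_t)ret;
  }
  return 0;
}
```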
--- libc/config/linux/aarch64/entrypoints.txt | 1 + libc/config/linux/x86_64/entrypoints.txt | 1 + libc/config/windows/entrypoints.txt | 3 ++ libc/config/windows/headers.txt | 1 + libc/include/sys/random.yaml | 7 ++++ libc/include/unistd.yaml | 7 ++++ libc/src/CMakeLists.txt | 2 +- libc/src/unistd/CMakeLists.txt | 7 ++++ libc/src/unistd/getentropy.h | 19 +++++++++ libc/src/unistd/linux/CMakeLists.txt | 15 +++++++ libc/src/unistd/linux/getentropy.cpp | 51 +++++++++++++++++++++++ libc/src/unistd/windows/CMakeLists.txt | 11 +++++ libc/src/unistd/windows/getentropy.cpp | 42 +++++++++++++++++++ libc/test/src/CMakeLists.txt | 2 +- libc/test/src/unistd/CMakeLists.txt | 12 ++++++ libc/test/src/unistd/getentropy_test.cpp | 28 +++++++++++++ 16 files changed, 207 insertions(+), 2 deletions(-) create mode 100644 libc/src/unistd/getentropy.h create mode 100644 libc/src/unistd/linux/getentropy.cpp create mode 100644 libc/src/unistd/windows/CMakeLists.txt create mode 100644 libc/src/unistd/windows/getentropy.cpp create mode 100644 libc/test/src/unistd/getentropy_test.cpp diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index fc2b0e91c1286..f5ba341411768 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -322,6 +322,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.fsync libc.src.unistd.ftruncate libc.src.unistd.getcwd + libc.src.unistd.getentropy libc.src.unistd.geteuid libc.src.unistd.getpid libc.src.unistd.getppid diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 723853b2230ae..0c1ae9561a7e6 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -321,6 +321,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.fsync libc.src.unistd.ftruncate libc.src.unistd.getcwd + libc.src.unistd.getentropy libc.src.unistd.geteuid libc.src.unistd.getpid libc.src.unistd.getppid diff --git a/libc/config/windows/entrypoints.txt b/libc/config/windows/entrypoints.txt index 4ecc3ada9c768..aad320995d339 100644 --- a/libc/config/windows/entrypoints.txt +++ b/libc/config/windows/entrypoints.txt @@ -101,6 +101,9 @@ set(TARGET_LIBC_ENTRYPOINTS # time.h entrypoints libc.src.time.time libc.src.time.clock_getres + + # unistd.h entrypoints + libc.src.unistd.getentropy ) set(TARGET_LIBM_ENTRYPOINTS diff --git a/libc/config/windows/headers.txt b/libc/config/windows/headers.txt index bccc04f7697e5..6d9aae9276924 100644 --- a/libc/config/windows/headers.txt +++ b/libc/config/windows/headers.txt @@ -6,4 +6,5 @@ set(TARGET_PUBLIC_HEADERS libc.include.errno libc.include.fenv libc.include.math + libc.include.unistd ) diff --git a/libc/include/sys/random.yaml b/libc/include/sys/random.yaml index 4efb2fbb44733..a97266a5481df 100644 --- a/libc/include/sys/random.yaml +++ b/libc/include/sys/random.yaml @@ -15,3 +15,10 @@ functions: - type: void * - type: size_t - type: unsigned int + - name: getentropy + standards: + - GNUExtensions + return_type: int + arguments: + - type: void * + - type: size_t diff --git a/libc/include/unistd.yaml b/libc/include/unistd.yaml index fada365e0103d..c1901be446fe5 100644 --- a/libc/include/unistd.yaml +++ b/libc/include/unistd.yaml @@ -128,6 +128,13 @@ functions: arguments: - type: char * - type: size_t + - name: getentropy + standards: + - GNUExtensions + return_type: int + arguments: + - type: void * + - type: size_t - name: geteuid standards: - POSIX diff --git a/libc/src/CMakeLists.txt b/libc/src/CMakeLists.txt 
index 32308ba147940..41183429f67a7 100644
--- a/libc/src/CMakeLists.txt
+++ b/libc/src/CMakeLists.txt
@@ -15,6 +15,7 @@ add_subdirectory(string)
 add_subdirectory(strings)
 add_subdirectory(wchar)
 add_subdirectory(time)
+add_subdirectory(unistd)
 
 if(${LIBC_TARGET_OS} STREQUAL "linux")
   add_subdirectory(dirent)
@@ -23,7 +24,6 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
   add_subdirectory(sched)
   add_subdirectory(sys)
   add_subdirectory(termios)
-  add_subdirectory(unistd)
 endif()
 
 if(NOT LLVM_LIBC_FULL_BUILD)
diff --git a/libc/src/unistd/CMakeLists.txt b/libc/src/unistd/CMakeLists.txt
index 1a0b2e3293d03..6bdea0c7693bd 100644
--- a/libc/src/unistd/CMakeLists.txt
+++ b/libc/src/unistd/CMakeLists.txt
@@ -350,3 +350,10 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.threads.identifier
 )
+
+add_entrypoint_object(
+  getentropy
+  ALIAS
+  DEPENDS
+    .${LIBC_TARGET_OS}.getentropy
+)
diff --git a/libc/src/unistd/getentropy.h b/libc/src/unistd/getentropy.h
new file mode 100644
index 0000000000000..27e13d2352d81
--- /dev/null
+++ b/libc/src/unistd/getentropy.h
@@ -0,0 +1,19 @@
+//===-- Implementation header for getentropy ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/types/size_t.h"
+#include "src/__support/common.h"
+
+#ifndef LLVM_LIBC_SRC_UNISTD_GETENTROPY_H
+#define LLVM_LIBC_SRC_UNISTD_GETENTROPY_H
+
+namespace LIBC_NAMESPACE_DECL {
+int getentropy(void *buffer, size_t length);
+}
+
+#endif // LLVM_LIBC_SRC_UNISTD_GETENTROPY_H
diff --git a/libc/src/unistd/linux/CMakeLists.txt b/libc/src/unistd/linux/CMakeLists.txt
index ed360c73354ac..2bb17f56f7b32 100644
--- a/libc/src/unistd/linux/CMakeLists.txt
+++ b/libc/src/unistd/linux/CMakeLists.txt
@@ -570,3 +570,18 @@ add_entrypoint_object(
     libc.src.__support.OSUtil.osutil
     libc.src.errno.errno
 )
+
+add_entrypoint_object(
+  getentropy
+  SRCS
+    getentropy.cpp
+  HDRS
+    ../getentropy.h
+  DEPENDS
+    libc.hdr.types.size_t
+    libc.hdr.types.ssize_t
+    libc.hdr.errno_macros
+    libc.include.sys_syscall
+    libc.src.__support.OSUtil.osutil
+    libc.src.errno.errno
+)
diff --git a/libc/src/unistd/linux/getentropy.cpp b/libc/src/unistd/linux/getentropy.cpp
new file mode 100644
index 0000000000000..168a1197734ed
--- /dev/null
+++ b/libc/src/unistd/linux/getentropy.cpp
@@ -0,0 +1,51 @@
+//===-- Linux implementation of getentropy --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/unistd/getentropy.h"
+#include "hdr/errno_macros.h"
+#include "src/__support/OSUtil/syscall.h"
+#include "src/__support/common.h"
+#include "src/errno/libc_errno.h"
+
+#include <sys/syscall.h> // For syscall numbers.
+
+namespace LIBC_NAMESPACE_DECL {
+LLVM_LIBC_FUNCTION(int, getentropy, (void *buffer, size_t length)) {
+  // check the length limit
+  if (length > 256) {
+    libc_errno = EIO;
+    return -1;
+  }
+
+  char *cursor = static_cast<char *>(buffer);
+  while (length != 0) {
+    // 0 flag means urandom and blocking, which meets the assumption of
+    // getentropy
+    auto ret = syscall_impl<ssize_t>(SYS_getrandom, cursor, length, 0);
+
+    // on success, advance the buffer pointer
+    if (ret >= 0) {
+      length -= static_cast<size_t>(ret);
+      cursor += ret;
+      continue;
+    }
+
+    auto error = -static_cast<int>(ret);
+
+    // on EINTR, try again
+    if (error == EINTR)
+      continue;
+
+    // on ENOSYS, forward errno and exit;
+    // otherwise, set EIO and exit
+    libc_errno = (error == ENOSYS) ? ENOSYS : EIO;
+    return -1;
+  }
+  return 0;
+}
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/unistd/windows/CMakeLists.txt b/libc/src/unistd/windows/CMakeLists.txt
new file mode 100644
index 0000000000000..195d98cdb51d4
--- /dev/null
+++ b/libc/src/unistd/windows/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_entrypoint_object(
+  getentropy
+  SRCS
+    getentropy.cpp
+  HDRS
+    ../getentropy.h
+  DEPENDS
+    libc.hdr.types.size_t
+    libc.hdr.errno_macros
+    libc.src.errno.errno
+)
diff --git a/libc/src/unistd/windows/getentropy.cpp b/libc/src/unistd/windows/getentropy.cpp
new file mode 100644
index 0000000000000..bfaec723ac63d
--- /dev/null
+++ b/libc/src/unistd/windows/getentropy.cpp
@@ -0,0 +1,42 @@
+//===-- Windows implementation of getentropy ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/unistd/getentropy.h"
+#include "hdr/errno_macros.h"
+#include "src/__support/common.h"
+#include "src/errno/libc_errno.h"
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <bcrypt.h>
+#include <ntstatus.h>
+#pragma comment(lib, "bcrypt.lib")
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, getentropy, (void *buffer, size_t length)) {
+  __try {
+    // check the length limit
+    if (length > 256)
+      __leave;
+
+    NTSTATUS result = ::BCryptGenRandom(nullptr, static_cast<PUCHAR>(buffer),
+                                        static_cast<ULONG>(length),
+                                        BCRYPT_USE_SYSTEM_PREFERRED_RNG);
+
+    if (result == STATUS_SUCCESS)
+      return 0;
+
+  } __except (EXCEPTION_EXECUTE_HANDLER) {
+    // no need to handle exceptions specially
+  }
+
+  libc_errno = EIO;
+  return -1;
+}
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/test/src/CMakeLists.txt b/libc/test/src/CMakeLists.txt
index 31008508d6492..22ec43588f744 100644
--- a/libc/test/src/CMakeLists.txt
+++ b/libc/test/src/CMakeLists.txt
@@ -61,6 +61,7 @@ add_subdirectory(string)
 add_subdirectory(strings)
 add_subdirectory(wchar)
 add_subdirectory(time)
+add_subdirectory(unistd)
 
 # Depends on utilities in stdlib
 add_subdirectory(inttypes)
@@ -70,7 +71,6 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
   add_subdirectory(sched)
   add_subdirectory(sys)
   add_subdirectory(termios)
-  add_subdirectory(unistd)
 endif()
 
 if(NOT LLVM_LIBC_FULL_BUILD)
diff --git a/libc/test/src/unistd/CMakeLists.txt b/libc/test/src/unistd/CMakeLists.txt
index e036e09cde702..c3eebdf2a877d 100644
--- a/libc/test/src/unistd/CMakeLists.txt
+++ b/libc/test/src/unistd/CMakeLists.txt
@@ -488,6 +488,18 @@ add_libc_test(
     libc.src.stdio.fflush
 )
 
+add_libc_test(
+  getentropy_test
+  SUITE
+    libc_unistd_unittests
+  SRCS
+    getentropy_test.cpp
+  DEPENDS
+
libc.src.unistd.getentropy + libc.src.errno.errno + libc.test.UnitTest.ErrnoSetterMatcher +) + if(LLVM_LIBC_FULL_BUILD) add_libc_test( _exit_test diff --git a/libc/test/src/unistd/getentropy_test.cpp b/libc/test/src/unistd/getentropy_test.cpp new file mode 100644 index 0000000000000..f7329ae419327 --- /dev/null +++ b/libc/test/src/unistd/getentropy_test.cpp @@ -0,0 +1,28 @@ +//===-- Unittests for getentropy ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "hdr/errno_macros.h" +#include "src/unistd/getentropy.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" +#include "test/UnitTest/Test.h" + +using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher; + +TEST(LlvmLibcUnistdGetEntropyTest, LengthTooLong) { + char buf[1024]; + ASSERT_THAT(LIBC_NAMESPACE::getentropy(buf, 257), Fails(EIO)); +} + +TEST(LlvmLibcUnistdGetEntropyTest, SmokeTest) { + char buf[256]; + ASSERT_THAT(LIBC_NAMESPACE::getentropy(buf, 256), Succeeds()); +} + +TEST(LlvmLibcUnistdGetEntropyTest, OtherError) { + ASSERT_THAT(LIBC_NAMESPACE::getentropy(nullptr, 1), Fails(EIO)); +} From 9025c269aa0b394ea755978348f882f85013ed12 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 15 Jan 2025 10:51:44 +0000 Subject: [PATCH 55/82] [AArch64] Add an extra test case for adds and subs combines. NFC --- llvm/test/CodeGen/AArch64/adds_cmn.ll | 80 +++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/adds_cmn.ll diff --git a/llvm/test/CodeGen/AArch64/adds_cmn.ll b/llvm/test/CodeGen/AArch64/adds_cmn.ll new file mode 100644 index 0000000000000..674a3893653a1 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/adds_cmn.ll @@ -0,0 +1,80 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -O3 -o - %s | FileCheck %s + +define { i32, i32 } @adds_cmn(i32 noundef %x, i32 noundef %y) { +; CHECK-LABEL: adds_cmn: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmn w0, w1 +; CHECK-NEXT: add w1, w0, w1 +; CHECK-NEXT: cset w8, lo +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ret +entry: + %0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %x, i32 %y) + %_4.1 = extractvalue { i32, i1 } %0, 1 + %_5 = add nuw i32 %x, %y + %_0.sroa.3.0 = select i1 %_4.1, i32 undef, i32 %_5 + %not._4.1 = xor i1 %_4.1, true + %_0.sroa.0.0 = zext i1 %not._4.1 to i32 + %1 = insertvalue { i32, i32 } poison, i32 %_0.sroa.0.0, 0 + %2 = insertvalue { i32, i32 } %1, i32 %_0.sroa.3.0, 1 + ret { i32, i32 } %2 +} + +define { i32, i32 } @adds_cmn_c(i32 noundef %x, i32 noundef %y) { +; CHECK-LABEL: adds_cmn_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmn w0, w1 +; CHECK-NEXT: add w1, w1, w0 +; CHECK-NEXT: cset w8, lo +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ret +entry: + %0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %x, i32 %y) + %_4.1 = extractvalue { i32, i1 } %0, 1 + %_5 = add nuw i32 %y, %x + %_0.sroa.3.0 = select i1 %_4.1, i32 undef, i32 %_5 + %not._4.1 = xor i1 %_4.1, true + %_0.sroa.0.0 = zext i1 %not._4.1 to i32 + %1 = insertvalue { i32, i32 } poison, i32 %_0.sroa.0.0, 0 + %2 = insertvalue { i32, i32 } %1, i32 %_0.sroa.3.0, 1 + ret { i32, i32 } %2 +} + +define { i32, i32 } @subs_cmp(i32 noundef %x, i32 noundef %y) { +; 
CHECK-LABEL: subs_cmp:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    subs w1, w0, w1
+; CHECK-NEXT:    cset w0, hs
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %x, i32 %y)
+  %_4.1 = extractvalue { i32, i1 } %0, 1
+  %_5 = sub nuw i32 %x, %y
+  %_0.sroa.3.0 = select i1 %_4.1, i32 undef, i32 %_5
+  %not._4.1 = xor i1 %_4.1, true
+  %_0.sroa.0.0 = zext i1 %not._4.1 to i32
+  %1 = insertvalue { i32, i32 } poison, i32 %_0.sroa.0.0, 0
+  %2 = insertvalue { i32, i32 } %1, i32 %_0.sroa.3.0, 1
+  ret { i32, i32 } %2
+}
+
+define { i32, i32 } @subs_cmp_c(i32 noundef %x, i32 noundef %y) {
+; CHECK-LABEL: subs_cmp_c:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp w0, w1
+; CHECK-NEXT:    sub w1, w1, w0
+; CHECK-NEXT:    cset w8, hs
+; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %x, i32 %y)
+  %_4.1 = extractvalue { i32, i1 } %0, 1
+  %_5 = sub nuw i32 %y, %x
+  %_0.sroa.3.0 = select i1 %_4.1, i32 undef, i32 %_5
+  %not._4.1 = xor i1 %_4.1, true
+  %_0.sroa.0.0 = zext i1 %not._4.1 to i32
+  %1 = insertvalue { i32, i32 } poison, i32 %_0.sroa.0.0, 0
+  %2 = insertvalue { i32, i32 } %1, i32 %_0.sroa.3.0, 1
+  ret { i32, i32 } %2
+}

From e33f456ae591559883e89a1f18b2dec21225e90f Mon Sep 17 00:00:00 2001
From: LoS
Date: Wed, 15 Jan 2025 12:26:36 +0100
Subject: [PATCH 56/82] Fixed some warn-override tests in SemaCXX (#122680)

The `.cpp` extension has been added to the test files so that they can
be run. In addition, the `warn-suggest-override.cpp` tests have been
fixed.

---------

Co-authored-by: LoS
---
 ...e => warn-inconsistent-missing-destructor-override.cpp} | 0
 ...uctor-override => warn-suggest-destructor-override.cpp} | 0
 .../{warn-suggest-override => warn-suggest-override.cpp}   | 7 ++++---
 3 files changed, 4 insertions(+), 3 deletions(-)
 rename clang/test/SemaCXX/{warn-inconsistent-missing-destructor-override => warn-inconsistent-missing-destructor-override.cpp} (100%)
 rename clang/test/SemaCXX/{warn-suggest-destructor-override => warn-suggest-destructor-override.cpp} (100%)
 rename clang/test/SemaCXX/{warn-suggest-override => warn-suggest-override.cpp} (58%)

diff --git a/clang/test/SemaCXX/warn-inconsistent-missing-destructor-override b/clang/test/SemaCXX/warn-inconsistent-missing-destructor-override.cpp
similarity index 100%
rename from clang/test/SemaCXX/warn-inconsistent-missing-destructor-override
rename to clang/test/SemaCXX/warn-inconsistent-missing-destructor-override.cpp
diff --git a/clang/test/SemaCXX/warn-suggest-destructor-override b/clang/test/SemaCXX/warn-suggest-destructor-override.cpp
similarity index 100%
rename from clang/test/SemaCXX/warn-suggest-destructor-override
rename to clang/test/SemaCXX/warn-suggest-destructor-override.cpp
diff --git a/clang/test/SemaCXX/warn-suggest-override b/clang/test/SemaCXX/warn-suggest-override.cpp
similarity index 58%
rename from clang/test/SemaCXX/warn-suggest-override
rename to clang/test/SemaCXX/warn-suggest-override.cpp
index e06c939ff001f..c4b5149c681a4 100644
--- a/clang/test/SemaCXX/warn-suggest-override
+++ b/clang/test/SemaCXX/warn-suggest-override.cpp
@@ -17,13 +17,13 @@ struct C {
 
 struct D : public C {
   void run();
-  // expected-warning@-1 {{'run()' overrides a member function but is not marked 'override'}}
+  // expected-warning@-1 {{'run' overrides a member function but is not marked 'override'}}
   ~D();
 };
 
 struct E : public C {
   virtual void run();
-  // expected-warning@-1 {{'run()' overrides a member function but is not marked 'override'}}
+  // expected-warning@-1 {{'run' overrides a member function but is not marked 'override'}}
   virtual ~E();
 };
 
@@ -32,7 +32,8 @@ struct F : public C {
   ~F() override;
 };
 
-struct G : public C {
+struct G : public C { // expected-note {{mark 'G' as 'final'}}
   void run() final;
   ~G() final;
+  // expected-warning@-1 {{class with destructor marked 'final' cannot be inherited from}}
 };

From c8bbbaa5c70a32f31a072740c87708be8f15f831 Mon Sep 17 00:00:00 2001
From: jofrn
Date: Wed, 15 Jan 2025 03:56:28 -0800
Subject: [PATCH 57/82] [SelectionDAG][AMDGPU] Negative offset when selecting
 scratch sv offsets (#122251)

The APInt constructor asserts when given a negative offset that is not
marked as signed. SelectScratchSVAddr can reach this constructor (via
checkFlatScratchSVSSwizzleBug) with a negative offset, so this change
constructs the APInt with /*isSigned=*/true instead.
---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp |  3 +-
 llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll  | 78 +++++++++++++++++++
 2 files changed, 80 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 27e9018d68a03..9fa9cccd3e3ed 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1926,7 +1926,8 @@ bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
   KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
   KnownBits SKnown =
       KnownBits::add(CurDAG->computeKnownBits(SAddr),
-                     KnownBits::makeConstant(APInt(32, ImmOffset)));
+                     KnownBits::makeConstant(APInt(32, ImmOffset,
+                                                   /*isSigned=*/true)));
   uint64_t VMax = VKnown.getMaxValue().getZExtValue();
   uint64_t SMax = SKnown.getMaxValue().getZExtValue();
   return (VMax & 3) + (SMax & 3) >= 4;
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
index 066c04b1af088..ef3657433e8b7 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
@@ -1243,3 +1243,81 @@ bb:
   store volatile i8 4, ptr addrspace(5) %p4
   ret void
 }
+
+define amdgpu_kernel void @soff1_voff1_negative(i32 %soff) {
+; GFX940-SDAG-LABEL: soff1_voff1_negative:
+; GFX940-SDAG:       ; %bb.0: ; %bb
+; GFX940-SDAG-NEXT:    s_load_dword s0, s[4:5], 0x24
+; GFX940-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 1
+; GFX940-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-SDAG-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX940-SDAG-NEXT:    v_add_u32_e32 v0, -1, v0
+; GFX940-SDAG-NEXT:    scratch_store_byte v0, v1, off sc0 sc1
+; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-SDAG-NEXT:    s_endpgm
+;
+; GFX940-GISEL-LABEL: soff1_voff1_negative:
+; GFX940-GISEL:       ; %bb.0: ; %bb
+; GFX940-GISEL-NEXT:    s_load_dword s0, s[4:5], 0x24
+; GFX940-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 1
+; GFX940-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX940-GISEL-NEXT:    v_add3_u32 v0, s0, v0, -1
+; GFX940-GISEL-NEXT:    scratch_store_byte v0, v1, off sc0 sc1
+; GFX940-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: soff1_voff1_negative:
+; GFX11-SDAG:       ; %bb.0: ; %bb
+; GFX11-SDAG-NEXT:    s_load_b32 s0, s[4:5], 0x24
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:-1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL:
soff1_voff1_negative: +; GFX11-GISEL: ; %bb.0: ; %bb +; GFX11-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_add_u32 s0, 0, s0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:-1 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: soff1_voff1_negative: +; GFX12-SDAG: ; %bb.0: ; %bb +; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:-1 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: soff1_voff1_negative: +; GFX12-GISEL: ; %bb.0: ; %bb +; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_add_co_u32 s0, 0, s0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_endpgm +bb: + %a = alloca [64 x i8], align 4, addrspace(5) + %as = getelementptr i8, ptr addrspace(5) %a, i32 %soff + %voff = call i32 @llvm.amdgcn.workitem.id.x() + %asv = getelementptr i8, ptr addrspace(5) %as, i32 %voff + %p1 = getelementptr i8, ptr addrspace(5) %asv, i32 -1 + store volatile i8 1, ptr addrspace(5) %p1 + ret void +} From 6affc1837537a802531a5394535f1f0b7ca865cb Mon Sep 17 00:00:00 2001 From: Clement Courbet Date: Wed, 15 Jan 2025 13:07:42 +0100 Subject: [PATCH 58/82] [clang-reorder-fields] Move trailing comments. (#122918) Currently, trailing comments get mixed up: ``` struct Foo { int a; // This one is the cool field // within the struct. int b; }; ``` becomes: ``` struct Foo { int b; // This one is the cool field // within the struct. int a; }; ``` This should be: ``` struct Foo { int b; int a; // This one is the cool field // within the struct. 
};
```
---
 .../ReorderFieldsAction.cpp                   | 69 ++++++++++++++++---
 .../test/clang-reorder-fields/Comments.cpp    | 23 +++++++
 2 files changed, 84 insertions(+), 8 deletions(-)
 create mode 100644 clang-tools-extra/test/clang-reorder-fields/Comments.cpp

diff --git a/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp b/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp
index dc3a3b6211b7e..80ee31368fe9a 100644
--- a/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp
+++ b/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp
@@ -63,7 +63,9 @@ getNewFieldsOrder(const RecordDecl *Definition,
     NameToIndex[Field->getName()] = Field->getFieldIndex();
 
   if (DesiredFieldsOrder.size() != NameToIndex.size()) {
-    llvm::errs() << "Number of provided fields doesn't match definition.\n";
+    llvm::errs() << "Number of provided fields (" << DesiredFieldsOrder.size()
+                 << ") doesn't match definition (" << NameToIndex.size()
+                 << ").\n";
     return {};
   }
   SmallVector NewFieldsOrder;
@@ -116,26 +118,77 @@ findMembersUsedInInitExpr(const CXXCtorInitializer *Initializer,
   return Results;
 }
 
-/// Returns the full source range for the field declaration up to (not
-/// including) the trailing semicolumn, including potential macro invocations,
-/// e.g. `int a GUARDED_BY(mu);`.
+/// Returns the next token after `Loc` (including comment tokens).
+static std::optional<Token> getTokenAfter(SourceLocation Loc,
+                                          const SourceManager &SM,
+                                          const LangOptions &LangOpts) {
+  if (Loc.isMacroID()) {
+    return std::nullopt;
+  }
+  Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
+
+  // Break down the source location.
+  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
+
+  // Try to load the file buffer.
+  bool InvalidTemp = false;
+  StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
+  if (InvalidTemp)
+    return std::nullopt;
+
+  const char *TokenBegin = File.data() + LocInfo.second;
+
+  Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
+              TokenBegin, File.end());
+  lexer.SetCommentRetentionState(true);
+  // Find the token.
+  Token Tok;
+  lexer.LexFromRawLexer(Tok);
+  return Tok;
+}
+
+/// Returns the end of the trailing comments after `Loc`.
+static SourceLocation getEndOfTrailingComment(SourceLocation Loc,
+                                              const SourceManager &SM,
+                                              const LangOptions &LangOpts) {
+  // We consider any following comment token that is indented more than the
+  // first comment to be part of the trailing comment.
+  const unsigned Column = SM.getPresumedColumnNumber(Loc);
+  std::optional<Token> Tok = getTokenAfter(Loc, SM, LangOpts);
+  while (Tok && Tok->is(tok::comment) &&
+         SM.getPresumedColumnNumber(Tok->getLocation()) > Column) {
+    Loc = Tok->getEndLoc();
+    Tok = getTokenAfter(Loc, SM, LangOpts);
+  }
+  return Loc;
+}
+
+/// Returns the full source range for the field declaration up to (including)
+/// the trailing semicolumn, including potential macro invocations,
+/// e.g. `int a GUARDED_BY(mu);`. If there is a trailing comment, include it.
 static SourceRange getFullFieldSourceRange(const FieldDecl &Field,
                                            const ASTContext &Context) {
-  SourceRange Range = Field.getSourceRange();
+  const SourceRange Range = Field.getSourceRange();
+  SourceLocation Begin = Range.getBegin();
   SourceLocation End = Range.getEnd();
   const SourceManager &SM = Context.getSourceManager();
   const LangOptions &LangOpts = Context.getLangOpts();
   while (true) {
     std::optional<Token> CurrentToken = Lexer::findNextToken(End, SM, LangOpts);
 
-    if (!CurrentToken || CurrentToken->is(tok::semi))
-      break;
+    if (!CurrentToken)
+      return SourceRange(Begin, End);
 
     if (CurrentToken->is(tok::eof))
       return Range; // Something is wrong, return the original range.
+
     End = CurrentToken->getLastLoc();
+
+    if (CurrentToken->is(tok::semi))
+      break;
   }
-  return SourceRange(Range.getBegin(), End);
+
+  End = getEndOfTrailingComment(End, SM, LangOpts);
+  return SourceRange(Begin, End);
 }
 
 /// Reorders fields in the definition of a struct/class.
diff --git a/clang-tools-extra/test/clang-reorder-fields/Comments.cpp b/clang-tools-extra/test/clang-reorder-fields/Comments.cpp
new file mode 100644
index 0000000000000..a31b6692c9ac7
--- /dev/null
+++ b/clang-tools-extra/test/clang-reorder-fields/Comments.cpp
@@ -0,0 +1,23 @@
+// RUN: clang-reorder-fields -record-name Foo -fields-order e1,e3,e2,a,c,b %s -- | FileCheck %s
+
+class Foo {
+  int a; // Trailing comment for a.
+  int b; // Multiline
+         // trailing for b.
+  // Prefix comments for c.
+  int c;
+
+  /*c-like*/ int e1;
+  int /*c-like*/ e2;
+  int e3 /*c-like*/;
+};
+
+// CHECK:      /*c-like*/ int e1;
+// CHECK-NEXT: int e3 /*c-like*/;
+// CHECK-NEXT: int /*c-like*/ e2;
+// CHECK-NEXT: int a; // Trailing comment for a.
+// CHECK-NEXT: // Prefix comments for c.
+// CHECK-NEXT: int c;
+// CHECK-NEXT: int b; // Multiline
+// CHECK-NEXT:        // trailing for b.
+

From f9350c9325bccb95e94583685bbb9322a15da610 Mon Sep 17 00:00:00 2001
From: Vlad Serebrennikov
Date: Wed, 15 Jan 2025 16:32:46 +0400
Subject: [PATCH 59/82] [clang][NFC] Update CODEOWNERS

---
 .github/CODEOWNERS | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index ab8b75f415870..484b947bda402 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -27,7 +27,6 @@
 /llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @nikic
 /llvm/lib/Transforms/InstCombine/ @nikic
 
-/clang/include/clang/Sema/Sema.h @Endilll
 /clang/test/CXX/drs/ @Endilll
 /clang/www/cxx_dr_status.html @Endilll
 /clang/www/make_cxx_dr_status @Endilll

From bfedf6460c2cad6e6f966b457d8d27084579dcd8 Mon Sep 17 00:00:00 2001
From: David Sherwood
Date: Wed, 15 Jan 2025 12:47:43 +0000
Subject: [PATCH 60/82] [LoopVectorize] Add support for reverse loops in
 isDereferenceableAndAlignedInLoop (#96752)

Currently, when we encounter a negative step in the induction variable,
isDereferenceableAndAlignedInLoop bails out because the element size is
signed-greater-than the step. This patch adds support for negative
steps in cases where we detect that the start address for the load is
of the form base + offset. In this case the address decrements in each
iteration, so we need to calculate the access size differently. I have
done this by calling getStartAndEndForAccess from
LoopAccessAnalysis.cpp.

The motivation for this patch comes from PR #88385, where a reviewer
requested reusing isDereferenceableAndAlignedInLoop, but that PR does
support reverse loops, which this function previously rejected.
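To make the bounds arithmetic concrete, here is a small self-contained
sketch using hypothetical numbers (an i32 load from base + 408 with step
-4 and a maximum backedge-taken count of 102; none of these values are
taken from the patch itself). With a negative step the last iteration
touches the lowest address, and the end of the range is the first access
plus the element size:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // Hypothetical access pattern: {(base + 408),+,-4}, 4 bytes per load.
  const int64_t StartOffset = 408, Step = -4, MaxBECount = 102, EltSize = 4;
  // Lowest address touched: the load on the final iteration.
  const int64_t Lo = StartOffset + Step * MaxBECount; // 408 - 408 = 0
  // One past the highest byte touched: first load plus the element size.
  const int64_t Hi = StartOffset + EltSize;           // 412
  assert(Lo == 0 && Hi - Lo == 412); // total access size is 412 bytes
  return 0;
}
```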
The changed test in LoopVectorize/X86/load-deref-pred.ll now passes
because previously we were calculating the total access size
incorrectly, whereas now it is 412 bytes and fits perfectly into the
alloca.
---
 .../llvm/Analysis/LoopAccessAnalysis.h        |  19 ++
 llvm/lib/Analysis/Loads.cpp                   | 109 ++++----
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      |  61 ++---
 .../LoopVectorize/X86/load-deref-pred.ll      | 238 ++++++------------
 .../LoopVectorize/load-deref-pred-align.ll    |  27 +-
 5 files changed, 188 insertions(+), 266 deletions(-)

diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index 31374a128856c..6fc6ca14d0889 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -853,6 +853,25 @@ bool sortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy, const DataLayout &DL,
 bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
                          ScalarEvolution &SE, bool CheckType = true);
 
+/// Calculate Start and End points of memory access.
+/// Let's assume A is the first access and B is a memory access on N-th loop
+/// iteration. Then B is calculated as:
+///   B = A + Step*N .
+/// Step value may be positive or negative.
+/// N is a calculated back-edge taken count:
+///     N = (TripCount > 0) ? RoundDown(TripCount -1 , VF) : 0
+/// Start and End points are calculated in the following way:
+///   Start = UMIN(A, B) ; End = UMAX(A, B) + SizeOfElt,
+/// where SizeOfElt is the size of single memory access in bytes.
+///
+/// There is no conflict when the intervals are disjoint:
+///   NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
+std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
+    const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *MaxBECount,
+    ScalarEvolution *SE,
+    DenseMap<std::pair<const SCEV *, Type *>,
+             std::pair<const SCEV *, const SCEV *>> *PointerBounds);
+
 class LoopAccessInfoManager {
   /// The cache.
   DenseMap<Loop *, std::unique_ptr<LoopAccessInfo>> LoopAccessInfoMap;
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 7bbd469bd035d..cc6760292c2ff 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -13,6 +13,7 @@
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AssumeBundleQueries.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/MemoryLocation.h"
@@ -275,84 +276,88 @@ static bool AreEquivalentAddressValues(const Value *A, const Value *B) {
 bool llvm::isDereferenceableAndAlignedInLoop(
     LoadInst *LI, Loop *L, ScalarEvolution &SE, DominatorTree &DT,
     AssumptionCache *AC, SmallVectorImpl<const SCEVPredicate *> *Predicates) {
+  const Align Alignment = LI->getAlign();
   auto &DL = LI->getDataLayout();
   Value *Ptr = LI->getPointerOperand();
-
   APInt EltSize(DL.getIndexTypeSizeInBits(Ptr->getType()),
                 DL.getTypeStoreSize(LI->getType()).getFixedValue());
-  const Align Alignment = LI->getAlign();
-
-  Instruction *HeaderFirstNonPHI = L->getHeader()->getFirstNonPHI();
 
   // If given a uniform (i.e. non-varying) address, see if we can prove the
   // access is safe within the loop w/o needing predication.
   if (L->isLoopInvariant(Ptr))
-    return isDereferenceableAndAlignedPointer(Ptr, Alignment, EltSize, DL,
-                                              HeaderFirstNonPHI, AC, &DT);
+    return isDereferenceableAndAlignedPointer(
+        Ptr, Alignment, EltSize, DL, L->getHeader()->getFirstNonPHI(), AC, &DT);
+
+  const SCEV *PtrScev = SE.getSCEV(Ptr);
+  auto *AddRec = dyn_cast<SCEVAddRecExpr>(PtrScev);
 
-  // Otherwise, check to see if we have a repeating access pattern where we can
-  // prove that all accesses are well aligned and dereferenceable.
-  auto *AddRec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Ptr));
+  // Check to see if we have a repeating access pattern and it's possible
+  // to prove all accesses are well aligned.
   if (!AddRec || AddRec->getLoop() != L || !AddRec->isAffine())
     return false;
+
   auto *Step = dyn_cast<SCEVConstant>(AddRec->getStepRecurrence(SE));
   if (!Step)
     return false;
 
-  auto TC = SE.getSmallConstantMaxTripCount(L, Predicates);
-  if (!TC)
+  // For the moment, restrict ourselves to the case where the access size is a
+  // multiple of the requested alignment and the base is aligned.
+  // TODO: generalize if a case found which warrants
+  if (EltSize.urem(Alignment.value()) != 0)
     return false;
 
   // TODO: Handle overlapping accesses.
-  // We should be computing AccessSize as (TC - 1) * Step + EltSize.
-  if (EltSize.sgt(Step->getAPInt()))
+  if (EltSize.ugt(Step->getAPInt().abs()))
+    return false;
+
+  const SCEV *MaxBECount =
+      SE.getPredicatedConstantMaxBackedgeTakenCount(L, *Predicates);
+  if (isa<SCEVCouldNotCompute>(MaxBECount))
+    return false;
+
+  const auto &[AccessStart, AccessEnd] = getStartAndEndForAccess(
+      L, PtrScev, LI->getType(), MaxBECount, &SE, nullptr);
+  if (isa<SCEVCouldNotCompute>(AccessStart) ||
+      isa<SCEVCouldNotCompute>(AccessEnd))
     return false;
 
-  // Compute the total access size for access patterns with unit stride and
-  // patterns with gaps. For patterns with unit stride, Step and EltSize are the
-  // same.
-  // For patterns with gaps (i.e. non unit stride), we are
-  // accessing EltSize bytes at every Step.
-  APInt AccessSize = TC * Step->getAPInt();
+  // Try to get the access size.
+  const SCEV *PtrDiff = SE.getMinusSCEV(AccessEnd, AccessStart);
+  APInt MaxPtrDiff = SE.getUnsignedRangeMax(PtrDiff);
 
-  assert(SE.isLoopInvariant(AddRec->getStart(), L) &&
-         "implied by addrec definition");
   Value *Base = nullptr;
-  if (auto *StartS = dyn_cast<SCEVUnknown>(AddRec->getStart())) {
-    Base = StartS->getValue();
-  } else if (auto *StartS = dyn_cast<SCEVAddExpr>(AddRec->getStart())) {
-    // Handle (NewBase + offset) as start value.
-    const auto *Offset = dyn_cast<SCEVConstant>(StartS->getOperand(0));
-    const auto *NewBase = dyn_cast<SCEVUnknown>(StartS->getOperand(1));
-    if (StartS->getNumOperands() == 2 && Offset && NewBase) {
-      // The following code below assumes the offset is unsigned, but GEP
-      // offsets are treated as signed so we can end up with a signed value
-      // here too. For example, suppose the initial PHI value is (i8 255),
-      // the offset will be treated as (i8 -1) and sign-extended to (i64 -1).
-      if (Offset->getAPInt().isNegative())
-        return false;
+  APInt AccessSize;
+  if (const SCEVUnknown *NewBase = dyn_cast<SCEVUnknown>(AccessStart)) {
+    Base = NewBase->getValue();
+    AccessSize = MaxPtrDiff;
+  } else if (auto *MinAdd = dyn_cast<SCEVAddExpr>(AccessStart)) {
+    if (MinAdd->getNumOperands() != 2)
+      return false;
 
-      // For the moment, restrict ourselves to the case where the offset is a
-      // multiple of the requested alignment and the base is aligned.
-      // TODO: generalize if a case found which warrants
-      if (Offset->getAPInt().urem(Alignment.value()) != 0)
-        return false;
-      Base = NewBase->getValue();
-      bool Overflow = false;
-      AccessSize = AccessSize.uadd_ov(Offset->getAPInt(), Overflow);
-      if (Overflow)
-        return false;
-    }
-  }
+    const auto *Offset = dyn_cast<SCEVConstant>(MinAdd->getOperand(0));
+    const auto *NewBase = dyn_cast<SCEVUnknown>(MinAdd->getOperand(1));
+    if (!Offset || !NewBase)
+      return false;
 
-  if (!Base)
-    return false;
+    // The following code below assumes the offset is unsigned, but GEP
+    // offsets are treated as signed so we can end up with a signed value
+    // here too. For example, suppose the initial PHI value is (i8 255),
+    // the offset will be treated as (i8 -1) and sign-extended to (i64 -1).
+    if (Offset->getAPInt().isNegative())
+      return false;
 
-  // For the moment, restrict ourselves to the case where the access size is a
-  // multiple of the requested alignment and the base is aligned.
-  // TODO: generalize if a case found which warrants
-  if (EltSize.urem(Alignment.value()) != 0)
+    // For the moment, restrict ourselves to the case where the offset is a
+    // multiple of the requested alignment and the base is aligned.
+    // TODO: generalize if a case found which warrants
+    if (Offset->getAPInt().urem(Alignment.value()) != 0)
+      return false;
+
+    AccessSize = MaxPtrDiff + Offset->getAPInt();
+    Base = NewBase->getValue();
+  } else
     return false;
+
+  Instruction *HeaderFirstNonPHI = L->getHeader()->getFirstNonPHI();
 
   return isDereferenceableAndAlignedPointer(Base, Alignment, AccessSize, DL,
                                             HeaderFirstNonPHI, AC, &DT);
 }
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 2a68979add666..11e0a221fc887 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -190,31 +190,20 @@ RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
   Members.push_back(Index);
 }
 
-/// Calculate Start and End points of memory access.
-/// Let's assume A is the first access and B is a memory access on N-th loop
-/// iteration. Then B is calculated as:
-///   B = A + Step*N .
-/// Step value may be positive or negative.
-/// N is a calculated back-edge taken count:
-///     N = (TripCount > 0) ? RoundDown(TripCount -1 , VF) : 0
-/// Start and End points are calculated in the following way:
-///   Start = UMIN(A, B) ; End = UMAX(A, B) + SizeOfElt,
-/// where SizeOfElt is the size of single memory access in bytes.
-///
-/// There is no conflict when the intervals are disjoint:
-///   NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
-static std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
-    const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy,
-    PredicatedScalarEvolution &PSE,
+std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
+    const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *MaxBECount,
+    ScalarEvolution *SE,
     DenseMap<std::pair<const SCEV *, Type *>,
-             std::pair<const SCEV *, const SCEV *>> &PointerBounds) {
-  ScalarEvolution *SE = PSE.getSE();
-
-  auto [Iter, Ins] = PointerBounds.insert(
-      {{PtrExpr, AccessTy},
-       {SE->getCouldNotCompute(), SE->getCouldNotCompute()}});
-  if (!Ins)
-    return Iter->second;
+             std::pair<const SCEV *, const SCEV *>> *PointerBounds) {
+  std::pair<const SCEV *, const SCEV *> *PtrBoundsPair;
+  if (PointerBounds) {
+    auto [Iter, Ins] = PointerBounds->insert(
+        {{PtrExpr, AccessTy},
+         {SE->getCouldNotCompute(), SE->getCouldNotCompute()}});
+    if (!Ins)
+      return Iter->second;
+    PtrBoundsPair = &Iter->second;
+  }
 
   const SCEV *ScStart;
   const SCEV *ScEnd;
@@ -222,10 +211,8 @@ static std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
   if (SE->isLoopInvariant(PtrExpr, Lp)) {
     ScStart = ScEnd = PtrExpr;
   } else if (auto *AR = dyn_cast<SCEVAddRecExpr>(PtrExpr)) {
-    const SCEV *Ex = PSE.getSymbolicMaxBackedgeTakenCount();
-
     ScStart = AR->getStart();
-    ScEnd = AR->evaluateAtIteration(Ex, *SE);
+    ScEnd = AR->evaluateAtIteration(MaxBECount, *SE);
     const SCEV *Step = AR->getStepRecurrence(*SE);
 
     // For expressions with negative step, the upper bound is ScStart and the
@@ -244,7 +231,7 @@ static std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
       return {SE->getCouldNotCompute(), SE->getCouldNotCompute()};
 
   assert(SE->isLoopInvariant(ScStart, Lp) && "ScStart needs to be invariant");
-  assert(SE->isLoopInvariant(ScEnd, Lp)&& "ScEnd needs to be invariant");
+  assert(SE->isLoopInvariant(ScEnd, Lp) && "ScEnd needs to be invariant");
 
   // Add the size of the pointed element to ScEnd.
   auto &DL = Lp->getHeader()->getDataLayout();
@@ -252,8 +239,10 @@ static std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
   const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IdxTy, AccessTy);
   ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV);
 
-  Iter->second = {ScStart, ScEnd};
-  return Iter->second;
+  std::pair<const SCEV *, const SCEV *> Res = {ScStart, ScEnd};
+  if (PointerBounds)
+    *PtrBoundsPair = Res;
+  return Res;
 }
 
 /// Calculate Start and End points of memory access using
@@ -263,8 +252,9 @@ void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr,
                                     unsigned DepSetId, unsigned ASId,
                                     PredicatedScalarEvolution &PSE,
                                     bool NeedsFreeze) {
+  const SCEV *MaxBECount = PSE.getSymbolicMaxBackedgeTakenCount();
   const auto &[ScStart, ScEnd] = getStartAndEndForAccess(
-      Lp, PtrExpr, AccessTy, PSE, DC.getPointerBounds());
+      Lp, PtrExpr, AccessTy, MaxBECount, PSE.getSE(), &DC.getPointerBounds());
   assert(!isa<SCEVCouldNotCompute>(ScStart) &&
          !isa<SCEVCouldNotCompute>(ScEnd) &&
          "must be able to compute both start and end expressions");
@@ -1938,10 +1928,11 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
   // required for correctness.
   if (SE.isLoopInvariant(Src, InnermostLoop) ||
       SE.isLoopInvariant(Sink, InnermostLoop)) {
-    const auto &[SrcStart_, SrcEnd_] =
-        getStartAndEndForAccess(InnermostLoop, Src, ATy, PSE, PointerBounds);
-    const auto &[SinkStart_, SinkEnd_] =
-        getStartAndEndForAccess(InnermostLoop, Sink, BTy, PSE, PointerBounds);
+    const SCEV *MaxBECount = PSE.getSymbolicMaxBackedgeTakenCount();
+    const auto &[SrcStart_, SrcEnd_] = getStartAndEndForAccess(
+        InnermostLoop, Src, ATy, MaxBECount, PSE.getSE(), &PointerBounds);
+    const auto &[SinkStart_, SinkEnd_] = getStartAndEndForAccess(
+        InnermostLoop, Sink, BTy, MaxBECount, PSE.getSE(), &PointerBounds);
 
     if (!isa<SCEVCouldNotCompute>(SrcStart_) &&
         !isa<SCEVCouldNotCompute>(SrcEnd_) &&
        !isa<SCEVCouldNotCompute>(SinkStart_) &&
diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
index 1433e48690bc6..3e50ee42866b9 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
@@ -2920,8 +2920,8 @@ loop_exit:
   ret i32 %accum.next
 }
 
-define i32 @neg_test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base) {
-; CHECK-LABEL: @neg_test_non_unit_stride_off_by_four_bytes(
+define i32 @test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base) {
+; CHECK-LABEL: @test_non_unit_stride_off_by_four_bytes(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [103 x i32], align 4
 ; CHECK-NEXT:    call void @init(ptr [[ALLOCA]])
@@ -2929,11 +2929,11 @@ define i32 @neg_test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base)
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE33:%.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP144:%.*]], [[PRED_LOAD_CONTINUE33]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP145:%.*]], [[PRED_LOAD_CONTINUE33]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP146:%.*]], [[PRED_LOAD_CONTINUE33]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP147:%.*]], [[PRED_LOAD_CONTINUE33]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP112:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP113:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP114:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP115:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
@@ -2999,170 +2999,74 @@ define i32 @neg_test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base)
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
 ; CHECK-NEXT:    [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
-; CHECK-NEXT:    [[TMP64:%.*]] = extractelement <4 x i1> [[TMP39]], i32 0
-; CHECK-NEXT:    br i1 [[TMP64]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP66:%.*]] = load i32, ptr [[TMP65]], align 4 -; CHECK-NEXT: [[TMP67:%.*]] = insertelement <4 x i32> poison, i32 [[TMP66]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] -; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP68:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP67]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP69:%.*]] = extractelement <4 x i1> [[TMP39]], i32 1 -; CHECK-NEXT: br i1 [[TMP69]], label [[PRED_LOAD_IF4:%.*]], label [[PRED_LOAD_CONTINUE5:%.*]] -; CHECK: pred.load.if4: -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP71:%.*]] = load i32, ptr [[TMP70]], align 4 -; CHECK-NEXT: [[TMP72:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP71]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE5]] -; CHECK: pred.load.continue5: -; CHECK-NEXT: [[TMP73:%.*]] = phi <4 x i32> [ [[TMP68]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP72]], [[PRED_LOAD_IF4]] ] -; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x i1> [[TMP39]], i32 2 -; CHECK-NEXT: br i1 [[TMP74]], label [[PRED_LOAD_IF6:%.*]], label [[PRED_LOAD_CONTINUE7:%.*]] -; CHECK: pred.load.if6: -; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP76:%.*]] = load i32, ptr [[TMP75]], align 4 -; CHECK-NEXT: [[TMP77:%.*]] = insertelement <4 x i32> [[TMP73]], i32 [[TMP76]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE7]] -; CHECK: pred.load.continue7: -; CHECK-NEXT: [[TMP78:%.*]] = phi <4 x i32> [ [[TMP73]], [[PRED_LOAD_CONTINUE5]] ], [ [[TMP77]], [[PRED_LOAD_IF6]] ] -; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i1> [[TMP39]], i32 3 -; CHECK-NEXT: br i1 [[TMP79]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]] -; CHECK: pred.load.if8: -; CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP81:%.*]] = load i32, ptr [[TMP80]], align 4 -; CHECK-NEXT: [[TMP82:%.*]] = insertelement <4 x i32> [[TMP78]], i32 [[TMP81]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE9]] -; CHECK: pred.load.continue9: -; CHECK-NEXT: [[TMP83:%.*]] = phi <4 x i32> [ [[TMP78]], [[PRED_LOAD_CONTINUE7]] ], [ [[TMP82]], [[PRED_LOAD_IF8]] ] -; CHECK-NEXT: [[TMP84:%.*]] = extractelement <4 x i1> [[TMP47]], i32 0 -; CHECK-NEXT: br i1 [[TMP84]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]] -; CHECK: pred.load.if10: -; CHECK-NEXT: [[TMP85:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP86:%.*]] = load i32, ptr [[TMP85]], align 4 -; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> poison, i32 [[TMP86]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE11]] -; CHECK: pred.load.continue11: -; CHECK-NEXT: [[TMP88:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE9]] ], [ [[TMP87]], [[PRED_LOAD_IF10]] ] -; CHECK-NEXT: [[TMP89:%.*]] = extractelement <4 x i1> [[TMP47]], i32 1 -; CHECK-NEXT: br i1 [[TMP89]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]] -; CHECK: pred.load.if12: -; CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP91:%.*]] = load i32, ptr [[TMP90]], align 4 -; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> [[TMP88]], i32 [[TMP91]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE13]] -; CHECK: pred.load.continue13: -; CHECK-NEXT: [[TMP93:%.*]] = phi <4 x i32> [ [[TMP88]], [[PRED_LOAD_CONTINUE11]] ], 
[ [[TMP92]], [[PRED_LOAD_IF12]] ] -; CHECK-NEXT: [[TMP94:%.*]] = extractelement <4 x i1> [[TMP47]], i32 2 -; CHECK-NEXT: br i1 [[TMP94]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]] -; CHECK: pred.load.if14: -; CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP95]], align 4 -; CHECK-NEXT: [[TMP97:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP96]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE15]] -; CHECK: pred.load.continue15: -; CHECK-NEXT: [[TMP98:%.*]] = phi <4 x i32> [ [[TMP93]], [[PRED_LOAD_CONTINUE13]] ], [ [[TMP97]], [[PRED_LOAD_IF14]] ] -; CHECK-NEXT: [[TMP99:%.*]] = extractelement <4 x i1> [[TMP47]], i32 3 -; CHECK-NEXT: br i1 [[TMP99]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]] -; CHECK: pred.load.if16: -; CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP101:%.*]] = load i32, ptr [[TMP100]], align 4 -; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP98]], i32 [[TMP101]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE17]] -; CHECK: pred.load.continue17: -; CHECK-NEXT: [[TMP103:%.*]] = phi <4 x i32> [ [[TMP98]], [[PRED_LOAD_CONTINUE15]] ], [ [[TMP102]], [[PRED_LOAD_IF16]] ] -; CHECK-NEXT: [[TMP104:%.*]] = extractelement <4 x i1> [[TMP55]], i32 0 -; CHECK-NEXT: br i1 [[TMP104]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]] -; CHECK: pred.load.if18: -; CHECK-NEXT: [[TMP105:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP106:%.*]] = load i32, ptr [[TMP105]], align 4 -; CHECK-NEXT: [[TMP107:%.*]] = insertelement <4 x i32> poison, i32 [[TMP106]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE19]] -; CHECK: pred.load.continue19: -; CHECK-NEXT: [[TMP108:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE17]] ], [ [[TMP107]], [[PRED_LOAD_IF18]] ] -; CHECK-NEXT: [[TMP109:%.*]] = extractelement <4 x i1> [[TMP55]], i32 1 -; CHECK-NEXT: br i1 [[TMP109]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]] -; CHECK: pred.load.if20: -; CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP111:%.*]] = load i32, ptr [[TMP110]], align 4 -; CHECK-NEXT: [[TMP112:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP111]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE21]] -; CHECK: pred.load.continue21: -; CHECK-NEXT: [[TMP113:%.*]] = phi <4 x i32> [ [[TMP108]], [[PRED_LOAD_CONTINUE19]] ], [ [[TMP112]], [[PRED_LOAD_IF20]] ] -; CHECK-NEXT: [[TMP114:%.*]] = extractelement <4 x i1> [[TMP55]], i32 2 -; CHECK-NEXT: br i1 [[TMP114]], label [[PRED_LOAD_IF22:%.*]], label [[PRED_LOAD_CONTINUE23:%.*]] -; CHECK: pred.load.if22: -; CHECK-NEXT: [[TMP115:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP116:%.*]] = load i32, ptr [[TMP115]], align 4 -; CHECK-NEXT: [[TMP117:%.*]] = insertelement <4 x i32> [[TMP113]], i32 [[TMP116]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE23]] -; CHECK: pred.load.continue23: -; CHECK-NEXT: [[TMP118:%.*]] = phi <4 x i32> [ [[TMP113]], [[PRED_LOAD_CONTINUE21]] ], [ [[TMP117]], [[PRED_LOAD_IF22]] ] -; CHECK-NEXT: [[TMP119:%.*]] = extractelement <4 x i1> [[TMP55]], i32 3 -; CHECK-NEXT: br i1 [[TMP119]], label [[PRED_LOAD_IF24:%.*]], label [[PRED_LOAD_CONTINUE25:%.*]] -; CHECK: pred.load.if24: -; CHECK-NEXT: [[TMP120:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP11]] -; CHECK-NEXT: 
[[TMP121:%.*]] = load i32, ptr [[TMP120]], align 4 -; CHECK-NEXT: [[TMP122:%.*]] = insertelement <4 x i32> [[TMP118]], i32 [[TMP121]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE25]] -; CHECK: pred.load.continue25: -; CHECK-NEXT: [[TMP123:%.*]] = phi <4 x i32> [ [[TMP118]], [[PRED_LOAD_CONTINUE23]] ], [ [[TMP122]], [[PRED_LOAD_IF24]] ] -; CHECK-NEXT: [[TMP124:%.*]] = extractelement <4 x i1> [[TMP63]], i32 0 -; CHECK-NEXT: br i1 [[TMP124]], label [[PRED_LOAD_IF26:%.*]], label [[PRED_LOAD_CONTINUE27:%.*]] -; CHECK: pred.load.if26: -; CHECK-NEXT: [[TMP125:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP126:%.*]] = load i32, ptr [[TMP125]], align 4 -; CHECK-NEXT: [[TMP127:%.*]] = insertelement <4 x i32> poison, i32 [[TMP126]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE27]] -; CHECK: pred.load.continue27: -; CHECK-NEXT: [[TMP128:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE25]] ], [ [[TMP127]], [[PRED_LOAD_IF26]] ] -; CHECK-NEXT: [[TMP129:%.*]] = extractelement <4 x i1> [[TMP63]], i32 1 -; CHECK-NEXT: br i1 [[TMP129]], label [[PRED_LOAD_IF28:%.*]], label [[PRED_LOAD_CONTINUE29:%.*]] -; CHECK: pred.load.if28: -; CHECK-NEXT: [[TMP130:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP131:%.*]] = load i32, ptr [[TMP130]], align 4 -; CHECK-NEXT: [[TMP132:%.*]] = insertelement <4 x i32> [[TMP128]], i32 [[TMP131]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE29]] -; CHECK: pred.load.continue29: -; CHECK-NEXT: [[TMP133:%.*]] = phi <4 x i32> [ [[TMP128]], [[PRED_LOAD_CONTINUE27]] ], [ [[TMP132]], [[PRED_LOAD_IF28]] ] -; CHECK-NEXT: [[TMP134:%.*]] = extractelement <4 x i1> [[TMP63]], i32 2 -; CHECK-NEXT: br i1 [[TMP134]], label [[PRED_LOAD_IF30:%.*]], label [[PRED_LOAD_CONTINUE31:%.*]] -; CHECK: pred.load.if30: -; CHECK-NEXT: [[TMP135:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP136:%.*]] = load i32, ptr [[TMP135]], align 4 -; CHECK-NEXT: [[TMP137:%.*]] = insertelement <4 x i32> [[TMP133]], i32 [[TMP136]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE31]] -; CHECK: pred.load.continue31: -; CHECK-NEXT: [[TMP138:%.*]] = phi <4 x i32> [ [[TMP133]], [[PRED_LOAD_CONTINUE29]] ], [ [[TMP137]], [[PRED_LOAD_IF30]] ] -; CHECK-NEXT: [[TMP139:%.*]] = extractelement <4 x i1> [[TMP63]], i32 3 -; CHECK-NEXT: br i1 [[TMP139]], label [[PRED_LOAD_IF32:%.*]], label [[PRED_LOAD_CONTINUE33]] -; CHECK: pred.load.if32: -; CHECK-NEXT: [[TMP140:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP141:%.*]] = load i32, ptr [[TMP140]], align 4 -; CHECK-NEXT: [[TMP142:%.*]] = insertelement <4 x i32> [[TMP138]], i32 [[TMP141]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE33]] -; CHECK: pred.load.continue33: -; CHECK-NEXT: [[TMP143:%.*]] = phi <4 x i32> [ [[TMP138]], [[PRED_LOAD_CONTINUE31]] ], [ [[TMP142]], [[PRED_LOAD_IF32]] ] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP83]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI34:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP103]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI35:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP123]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI36:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP143]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP144]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP145]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI34]] -; CHECK-NEXT: [[TMP146]] = add <4 x i32> [[VEC_PHI2]], 
[[PREDPHI35]] -; CHECK-NEXT: [[TMP147]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI36]] +; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP80:%.*]] = load i32, ptr [[TMP64]], align 4 +; CHECK-NEXT: [[TMP81:%.*]] = load i32, ptr [[TMP65]], align 4 +; CHECK-NEXT: [[TMP82:%.*]] = load i32, ptr [[TMP66]], align 4 +; CHECK-NEXT: [[TMP83:%.*]] = load i32, ptr [[TMP67]], align 4 +; CHECK-NEXT: [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0 +; CHECK-NEXT: [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1 +; CHECK-NEXT: [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2 +; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3 +; CHECK-NEXT: [[TMP88:%.*]] = load i32, ptr [[TMP68]], align 4 +; CHECK-NEXT: [[TMP89:%.*]] = load i32, ptr [[TMP69]], align 4 +; CHECK-NEXT: [[TMP90:%.*]] = load i32, ptr [[TMP70]], align 4 +; CHECK-NEXT: [[TMP91:%.*]] = load i32, ptr [[TMP71]], align 4 +; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0 +; CHECK-NEXT: [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1 +; CHECK-NEXT: [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2 +; CHECK-NEXT: [[TMP95:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP91]], i32 3 +; CHECK-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP72]], align 4 +; CHECK-NEXT: [[TMP97:%.*]] = load i32, ptr [[TMP73]], align 4 +; CHECK-NEXT: [[TMP98:%.*]] = load i32, ptr [[TMP74]], align 4 +; CHECK-NEXT: [[TMP99:%.*]] = load i32, ptr [[TMP75]], align 4 +; CHECK-NEXT: [[TMP100:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0 +; CHECK-NEXT: [[TMP101:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP97]], i32 1 +; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP101]], i32 [[TMP98]], i32 2 +; CHECK-NEXT: [[TMP103:%.*]] = insertelement <4 x i32> [[TMP102]], i32 [[TMP99]], i32 3 +; CHECK-NEXT: [[TMP104:%.*]] = load i32, ptr [[TMP76]], align 4 +; CHECK-NEXT: [[TMP105:%.*]] = load i32, ptr [[TMP77]], align 4 +; CHECK-NEXT: [[TMP106:%.*]] = load i32, ptr [[TMP78]], align 4 +; CHECK-NEXT: [[TMP107:%.*]] = load i32, ptr 
[[TMP79]], align 4 +; CHECK-NEXT: [[TMP108:%.*]] = insertelement <4 x i32> poison, i32 [[TMP104]], i32 0 +; CHECK-NEXT: [[TMP109:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP105]], i32 1 +; CHECK-NEXT: [[TMP110:%.*]] = insertelement <4 x i32> [[TMP109]], i32 [[TMP106]], i32 2 +; CHECK-NEXT: [[TMP111:%.*]] = insertelement <4 x i32> [[TMP110]], i32 [[TMP107]], i32 3 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP87]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI4:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP95]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI5:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP103]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI6:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP111]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP112]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP113]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI4]] +; CHECK-NEXT: [[TMP114]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI5]] +; CHECK-NEXT: [[TMP115]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI6]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP148:%.*]] = icmp eq i64 [[INDEX_NEXT]], 48 -; CHECK-NEXT: br i1 [[TMP148]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; CHECK-NEXT: [[TMP116:%.*]] = icmp eq i64 [[INDEX_NEXT]], 48 +; CHECK-NEXT: br i1 [[TMP116]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP145]], [[TMP144]] -; CHECK-NEXT: [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP146]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX38:%.*]] = add <4 x i32> [[TMP147]], [[BIN_RDX37]] -; CHECK-NEXT: [[TMP149:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP113]], [[TMP112]] +; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <4 x i32> [[TMP114]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP115]], [[BIN_RDX7]] +; CHECK-NEXT: [[TMP117:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX8]]) ; CHECK-NEXT: br i1 false, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP149]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP117]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -3181,7 +3085,7 @@ define i32 @neg_test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base) ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 100 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP37:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP149]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP117]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll index 0f4e327891899..cbc483fabc184 100644 --- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll +++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll @@ -351,27 
+351,30 @@ define void @test_rev_loops_deref_loads(ptr nocapture noundef writeonly %dest) { ; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD]], <2 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i32> [[REVERSE]], splat (i32 3) ; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i32 -1 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[REVERSE2:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD1]], <2 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 ; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP0]] -; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[REVERSE2]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = shl nsw i32 [[TMP11]], 2 ; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP10]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 ; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]] -; CHECK: pred.store.if1: +; CHECK: pred.store.if3: ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], -1 -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP21]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[REVERSE2]], i32 1 ; CHECK-NEXT: [[TMP17:%.*]] = shl nsw i32 [[TMP16]], 2 ; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] -; CHECK: pred.store.continue2: +; CHECK: pred.store.continue4: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -665,12 +668,15 @@ define void @test_rev_loops_strided_deref_loads(ptr nocapture noundef writeonly ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i32> [[REVERSE]], splat (i32 3) ; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true) ; CHECK-NEXT: [[TMP6:%.*]] = mul <2 x i64> [[VEC_IND]], splat (i64 2) +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 4 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 ; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: -; CHECK-NEXT: 
[[TMP8:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP0]] ; CHECK-NEXT: [[TMP15:%.*]] = shl nsw i32 [[TMP11]], 2 ; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 @@ -680,9 +686,6 @@ define void @test_rev_loops_strided_deref_loads(ptr nocapture noundef writeonly ; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] ; CHECK: pred.store.if1: ; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], -1 -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP23]] -; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP24]], align 4 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP17]] ; CHECK-NEXT: [[TMP19:%.*]] = shl nsw i32 [[TMP12]], 2 ; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4 From b92e97bdd598f1d4676945c3e87d40404a367327 Mon Sep 17 00:00:00 2001 From: Alex Bradbury Date: Wed, 15 Jan 2025 12:50:20 +0000 Subject: [PATCH 61/82] [test] Pre-commit llvm.experimental.memset.pattern tests prior to MemoryLocation improvements Reviewed as part of . --- llvm/test/Analysis/BasicAA/memset-pattern.ll | 18 +++++ .../memory-intrinsics-sizes.ll | 76 +++++++++++++++++++ 2 files changed, 94 insertions(+) create mode 100644 llvm/test/Analysis/BasicAA/memset-pattern.ll diff --git a/llvm/test/Analysis/BasicAA/memset-pattern.ll b/llvm/test/Analysis/BasicAA/memset-pattern.ll new file mode 100644 index 0000000000000..33d3d125b5794 --- /dev/null +++ b/llvm/test/Analysis/BasicAA/memset-pattern.ll @@ -0,0 +1,18 @@ +; RUN: opt -mtriple=x86_64 -aa-pipeline=basic-aa -passes=inferattrs,aa-eval -print-all-alias-modref-info -disable-output 2>&1 %s | FileCheck %s + +define void @test_memset_pattern4_const_size(ptr noalias %a, i32 %pattern) { +; CHECK-LABEL: Function: test_memset_pattern4_const_size +; CHECK: Just Mod: Ptr: i8* %a <-> call void @llvm.experimental.memset.pattern.p0.i32.i64(ptr %a, i32 %pattern, i64 17, i1 false) +; CHECK-NEXT: Just Mod: Ptr: i8* %a.gep.1 <-> call void @llvm.experimental.memset.pattern.p0.i32.i64(ptr %a, i32 %pattern, i64 17, i1 false) +; CHECK-NEXT: Just Mod: Ptr: i8* %a.gep.129 <-> call void @llvm.experimental.memset.pattern.p0.i32.i64(ptr %a, i32 %pattern, i64 17, i1 false) + +entry: + load i8, ptr %a + call void @llvm.experimental.memset.pattern(ptr %a, i32 %pattern, i64 17, i1 0) + %a.gep.1 = getelementptr i8, ptr %a, i32 1 + store i8 0, ptr %a.gep.1 + %a.gep.129 = getelementptr i8, ptr %a, i32 129 + store i8 1, ptr %a.gep.129 + + ret void +} diff --git a/llvm/test/Transforms/DeadStoreElimination/memory-intrinsics-sizes.ll b/llvm/test/Transforms/DeadStoreElimination/memory-intrinsics-sizes.ll index 09d8bbf3c93bc..10afbdc432ad5 100644 --- a/llvm/test/Transforms/DeadStoreElimination/memory-intrinsics-sizes.ll +++ b/llvm/test/Transforms/DeadStoreElimination/memory-intrinsics-sizes.ll @@ -66,6 +66,82 @@ define void @memset_and_store_2(ptr %ptr, i64 %len) { ret void } +define void @memset_pattern_equal_size_values(ptr %ptr, i64 %len) { +; CHECK-LABEL: @memset_pattern_equal_size_values( +; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 [[PTR:%.*]], i8 0, i64 [[LEN:%.*]], i1 
false) +; CHECK-NEXT: ret void +; + call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 %ptr, i8 0, i64 %len, i1 false) + call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 %ptr, i8 0, i64 %len, i1 false) + ret void +} + +define void @memset_pattern_different_size_values_1(ptr %ptr, i64 %len.1, i64 %len.2) { +; CHECK-LABEL: @memset_pattern_different_size_values_1( +; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 [[PTR:%.*]], i8 0, i64 [[LEN_1:%.*]], i1 false) +; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 [[PTR]], i8 0, i64 [[LEN_2:%.*]], i1 false) +; CHECK-NEXT: ret void +; + call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 %ptr, i8 0, i64 %len.1, i1 false) + call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 %ptr, i8 0, i64 %len.2, i1 false) + ret void +} + +define void @memset_pattern_different_size_values_2(ptr %ptr, i64 %len) { +; CHECK-LABEL: @memset_pattern_different_size_values_2( +; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 [[PTR:%.*]], i8 0, i64 [[LEN:%.*]], i1 false) +; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 [[PTR]], i8 0, i64 100, i1 false) +; CHECK-NEXT: ret void +; + call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 %ptr, i8 0, i64 %len, i1 false) + call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 %ptr, i8 0, i64 100, i1 false) + ret void +} + +define void @memset_pattern_different_size_values_3(ptr %ptr, i64 %len) { +; CHECK-LABEL: @memset_pattern_different_size_values_3( +; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 [[PTR:%.*]], i8 0, i64 100, i1 false) +; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 [[PTR]], i8 0, i64 [[LEN:%.*]], i1 false) +; CHECK-NEXT: ret void +; + call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 %ptr, i8 0, i64 100, i1 false) + call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 %ptr, i8 0, i64 %len, i1 false) + ret void +} + +define void @memset_pattern_and_store_1(ptr %ptr, i64 %len) { +; CHECK-LABEL: @memset_pattern_and_store_1( +; CHECK-NEXT: store i64 123, ptr [[PTR:%.*]], align 4 +; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 [[PTR]], i8 0, i64 [[LEN:%.*]], i1 false) +; CHECK-NEXT: ret void +; + store i64 123, ptr %ptr + call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 %ptr, i8 0, i64 %len, i1 false) + ret void +} + +define void @memset_pattern_and_store_2(ptr %ptr, i64 %len) { +; CHECK-LABEL: @memset_pattern_and_store_2( +; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 [[PTR:%.*]], i8 0, i64 [[LEN:%.*]], i1 false) +; CHECK-NEXT: store i64 123, ptr [[PTR]], align 4 +; CHECK-NEXT: ret void +; + call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 %ptr, i8 0, i64 %len, i1 false) + store i64 123, ptr %ptr + ret void +} + +define void @memset_pattern_and_store_3(ptr %ptr) { +; CHECK-LABEL: @memset_pattern_and_store_3( +; CHECK-NEXT: store i64 0, ptr [[PTR:%.*]], align 4 +; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 [[PTR]], i8 0, i64 13, i1 false) +; CHECK-NEXT: ret void +; + store i64 0, ptr %ptr + call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 %ptr, i8 0, i64 13, i1 false) + ret void +} + define void @memcpy_equal_size_values(ptr noalias %src, ptr noalias %dst, i64 %len) { 
; CHECK-LABEL: @memcpy_equal_size_values(
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]], i1 false)

From da4551aad0c2f263dca5b1bbc9a2fe52527047f5 Mon Sep 17 00:00:00 2001
From: David CARLIER
Date: Wed, 15 Jan 2025 13:31:14 +0000
Subject: [PATCH 62/82] [compiler-rt][sanitizer_common] Fix for solaris and *BSD platforms proposal. (#122956)

Fixes the llvm#122795 build failures on these platforms.
---
 .../lib/sanitizer_common/sanitizer_platform_limits_freebsd.cpp | 1 +
 .../lib/sanitizer_common/sanitizer_platform_limits_freebsd.h   | 1 +
 .../lib/sanitizer_common/sanitizer_platform_limits_netbsd.cpp  | 1 +
 .../lib/sanitizer_common/sanitizer_platform_limits_netbsd.h    | 1 +
 .../lib/sanitizer_common/sanitizer_platform_limits_solaris.cpp | 3 +++
 .../lib/sanitizer_common/sanitizer_platform_limits_solaris.h   | 2 ++
 6 files changed, 9 insertions(+)

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.cpp
index b25b45e021744..4940062eeae47 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.cpp
@@ -123,6 +123,7 @@ unsigned pid_t_sz = sizeof(pid_t);
 unsigned timeval_sz = sizeof(timeval);
 unsigned uid_t_sz = sizeof(uid_t);
 unsigned gid_t_sz = sizeof(gid_t);
+unsigned fpos_t_sz = sizeof(fpos_t);
 unsigned mbstate_t_sz = sizeof(mbstate_t);
 unsigned sigset_t_sz = sizeof(sigset_t);
 unsigned struct_timezone_sz = sizeof(struct timezone);
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.h
index 3942f1523437f..8ce73f206fd88 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.h
@@ -46,6 +46,7 @@ extern unsigned pid_t_sz;
 extern unsigned timeval_sz;
 extern unsigned uid_t_sz;
 extern unsigned gid_t_sz;
+extern unsigned fpos_t_sz;
 extern unsigned mbstate_t_sz;
 extern unsigned struct_timezone_sz;
 extern unsigned struct_tms_sz;
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.cpp
index ecabbf0d08e2c..aacd28c55ceaa 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.cpp
@@ -547,6 +547,7 @@ unsigned pid_t_sz = sizeof(pid_t);
 unsigned timeval_sz = sizeof(timeval);
 unsigned uid_t_sz = sizeof(uid_t);
 unsigned gid_t_sz = sizeof(gid_t);
+unsigned fpos_t_sz = sizeof(fpos_t);
 unsigned mbstate_t_sz = sizeof(mbstate_t);
 unsigned sigset_t_sz = sizeof(sigset_t);
 unsigned struct_timezone_sz = sizeof(struct timezone);
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.h
index 4f892577d0b00..3758a9101c2a0 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.h
@@ -36,6 +36,7 @@ extern unsigned pid_t_sz;
 extern unsigned timeval_sz;
 extern unsigned uid_t_sz;
 extern unsigned gid_t_sz;
+extern unsigned fpos_t_sz;
 extern unsigned mbstate_t_sz;
 extern unsigned struct_timezone_sz;
 extern unsigned struct_tms_sz;
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_solaris.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_solaris.cpp
index dad7bde1498a7..7ea6134b702bf 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_solaris.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_solaris.cpp
@@ -32,6 +32,7 @@
 #include
 #include
 #include
+#include <stdio.h>
 #include
 #include
 #include
@@ -135,6 +136,8 @@ namespace __sanitizer {
   unsigned struct_sioc_sg_req_sz = sizeof(struct sioc_sg_req);
   unsigned struct_sioc_vif_req_sz = sizeof(struct sioc_vif_req);
+  unsigned fpos_t_sz = sizeof(fpos_t);
+
   const unsigned IOCTL_NOT_PRESENT = 0;

   unsigned IOCTL_FIOASYNC = FIOASYNC;
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_solaris.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_solaris.h
index 84a81265162c6..bf6586d27228f 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_solaris.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_solaris.h
@@ -418,6 +418,8 @@ extern unsigned struct_winsize_sz;
 extern unsigned struct_sioc_sg_req_sz;
 extern unsigned struct_sioc_vif_req_sz;
+extern unsigned fpos_t_sz;
+
 // ioctl request identifiers

 // A special value to mark ioctls that are not present on the target platform,

From e00d1dd6eaf46cf17080cdf506348ab8f037f6f2 Mon Sep 17 00:00:00 2001
From: Un1q32
Date: Wed, 15 Jan 2025 08:31:54 -0500
Subject: [PATCH 63/82] [ARM] Fix armv6kz LDREX definition (#122965)

Fixes #37901. This behavior is consistent with GCC.
---
 clang/lib/Basic/Targets/ARM.cpp        | 3 ++-
 clang/test/Preprocessor/arm-acle-6.4.c | 4 ++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Basic/Targets/ARM.cpp b/clang/lib/Basic/Targets/ARM.cpp
index 61ee26d886383..0fd5433a76402 100644
--- a/clang/lib/Basic/Targets/ARM.cpp
+++ b/clang/lib/Basic/Targets/ARM.cpp
@@ -617,7 +617,8 @@ bool ARMTargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
   case 6:
     if (ArchProfile == llvm::ARM::ProfileKind::M)
       LDREX = 0;
-    else if (ArchKind == llvm::ARM::ArchKind::ARMV6K)
+    else if (ArchKind == llvm::ARM::ArchKind::ARMV6K ||
+             ArchKind == llvm::ARM::ArchKind::ARMV6KZ)
       LDREX = LDREX_D | LDREX_W | LDREX_H | LDREX_B;
     else
       LDREX = LDREX_W;
diff --git a/clang/test/Preprocessor/arm-acle-6.4.c b/clang/test/Preprocessor/arm-acle-6.4.c
index fcabe028b9559..2c8f4868263a6 100644
--- a/clang/test/Preprocessor/arm-acle-6.4.c
+++ b/clang/test/Preprocessor/arm-acle-6.4.c
@@ -93,6 +93,10 @@

 // CHECK-V6K: __ARM_FEATURE_LDREX 0xf

+// RUN: %clang -target arm-none-linux-eabi -march=armv6kz -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-V6KZ
+
+// CHECK-V6KZ: __ARM_FEATURE_LDREX 0xf
+
 // RUN: %clang -target arm-none-linux-eabi -march=armv7-a -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-V7A

 // CHECK-V7A: __ARM_ARCH 7

From d1314d0152f242c618caafce264fccbc47273d84 Mon Sep 17 00:00:00 2001
From: Alex Bradbury
Date: Wed, 15 Jan 2025 13:50:23 +0000
Subject: [PATCH 64/82] [MemoryLocation] Teach MemoryLocation about llvm.experimental.memset.pattern (#120421)

Relates to (but isn't dependent on) #120420.

This allows alias analysis of the intrinsic of the same quality as for the
libcall, which we want in order to move LoopIdiomRecognize over to selecting
the intrinsic.
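As a rough illustration of what this change enables (this sketch is not part
of the patch; the helper name is hypothetical, and `Call`/`TLI` are assumed
to be supplied by the surrounding pass):

```cpp
// Hypothetical helper: inspect the destination location of a call to
// llvm.experimental.memset.pattern. With this patch, a ConstantInt length
// gives a precise location size (length * alloc size of the pattern type)
// instead of an unbounded "everything after the pointer" location.
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

void dumpMemsetPatternDestLoc(const CallBase *Call,
                              const TargetLibraryInfo &TLI) {
  // Argument 0 is the destination pointer of the intrinsic.
  MemoryLocation DestLoc =
      MemoryLocation::getForArgument(Call, /*ArgIdx=*/0, TLI);
  if (DestLoc.Size.isPrecise())
    errs() << "dest location size: " << DestLoc.Size << "\n";
}
```

A precise size is what lets BasicAA conclude NoModRef for accesses that
provably lie past the written range, as the updated memset-pattern.ll check
line below shows.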
---
 llvm/lib/Analysis/MemoryLocation.cpp                | 12 ++++++++++++
 llvm/test/Analysis/BasicAA/memset-pattern.ll        |  2 +-
 .../DeadStoreElimination/memory-intrinsics-sizes.ll |  3 +--
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Analysis/MemoryLocation.cpp b/llvm/lib/Analysis/MemoryLocation.cpp
index b664b54c044f5..6e3232772706a 100644
--- a/llvm/lib/Analysis/MemoryLocation.cpp
+++ b/llvm/lib/Analysis/MemoryLocation.cpp
@@ -183,6 +183,18 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call,
                           AATags);
     return MemoryLocation::getAfter(Arg, AATags);
+  case Intrinsic::experimental_memset_pattern:
+    assert((ArgIdx == 0 || ArgIdx == 1) &&
+           "Invalid argument index for memory intrinsic");
+    if (ConstantInt *LenCI = dyn_cast<ConstantInt>(II->getArgOperand(2)))
+      return MemoryLocation(
+          Arg,
+          LocationSize::precise(
+              LenCI->getZExtValue() *
+              DL.getTypeAllocSize(II->getArgOperand(1)->getType())),
+          AATags);
+    return MemoryLocation::getAfter(Arg, AATags);
+
   case Intrinsic::lifetime_start:
   case Intrinsic::lifetime_end:
   case Intrinsic::invariant_start:
diff --git a/llvm/test/Analysis/BasicAA/memset-pattern.ll b/llvm/test/Analysis/BasicAA/memset-pattern.ll
index 33d3d125b5794..aaa605db0eb26 100644
--- a/llvm/test/Analysis/BasicAA/memset-pattern.ll
+++ b/llvm/test/Analysis/BasicAA/memset-pattern.ll
@@ -4,7 +4,7 @@ define void @test_memset_pattern4_const_size(ptr noalias %a, i32 %pattern) {
 ; CHECK-LABEL: Function: test_memset_pattern4_const_size
 ; CHECK: Just Mod: Ptr: i8* %a <-> call void @llvm.experimental.memset.pattern.p0.i32.i64(ptr %a, i32 %pattern, i64 17, i1 false)
 ; CHECK-NEXT: Just Mod: Ptr: i8* %a.gep.1 <-> call void @llvm.experimental.memset.pattern.p0.i32.i64(ptr %a, i32 %pattern, i64 17, i1 false)
-; CHECK-NEXT: Just Mod: Ptr: i8* %a.gep.129 <-> call void @llvm.experimental.memset.pattern.p0.i32.i64(ptr %a, i32 %pattern, i64 17, i1 false)
+; CHECK-NEXT: NoModRef: Ptr: i8* %a.gep.129 <-> call void @llvm.experimental.memset.pattern.p0.i32.i64(ptr %a, i32 %pattern, i64 17, i1 false)

 entry:
   load i8, ptr %a
diff --git a/llvm/test/Transforms/DeadStoreElimination/memory-intrinsics-sizes.ll b/llvm/test/Transforms/DeadStoreElimination/memory-intrinsics-sizes.ll
index 10afbdc432ad5..947d8a788c244 100644
--- a/llvm/test/Transforms/DeadStoreElimination/memory-intrinsics-sizes.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/memory-intrinsics-sizes.ll
@@ -133,8 +133,7 @@ define void @memset_pattern_and_store_2(ptr %ptr, i64 %len) {

 define void @memset_pattern_and_store_3(ptr %ptr) {
 ; CHECK-LABEL: @memset_pattern_and_store_3(
-; CHECK-NEXT: store i64 0, ptr [[PTR:%.*]], align 4
-; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 [[PTR]], i8 0, i64 13, i1 false)
+; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 [[PTR:%.*]], i8 0, i64 13, i1 false)
 ; CHECK-NEXT: ret void
 ;
   store i64 0, ptr %ptr

From a32c45631b69eeb605f71de3b21ea9f2fba88e34 Mon Sep 17 00:00:00 2001
From: Kareem Ergawy
Date: Wed, 15 Jan 2025 14:52:10 +0100
Subject: [PATCH 65/82] [flang][OpenMP] Generalize fixing `alloca` IP pre-condition for `private` ops (#122866)

This PR generalizes a fix that we implemented previously for `omp.wsloop`s.
The fix makes sure that the pre-condition that the `alloca` block has a
single successor is respected whenever we inline delayed privatizers. I
simply moved the fix to `allocatePrivateVars` so that it kicks in for any
op, not just `omp.wsloop`.
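For reference, a minimal sketch of the pre-condition being enforced (the
helper below is hypothetical and for illustration only; the actual change
lives in `allocatePrivateVars`, shown in the diff that follows):

```cpp
// Hypothetical illustration of the pre-condition: before delayed
// privatizers are inlined, the alloca insertion block must end in an
// unconditional branch. When it does not, the block is split first
// (introducing "omp.region.after_alloca").
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"

static bool allocaBlockReadyForPrivatization(llvm::BasicBlock *AllocaBB) {
  auto *Term = llvm::dyn_cast<llvm::BranchInst>(AllocaBB->getTerminator());
  return Term && Term->isUnconditional();
}
```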
This handles a bug uncovered by [a test](https://github.com/OpenMP-Validation-and-Verification/OpenMP_VV/blob/master/tests/4.5/target_simd/test_target_simd_safelen.F90) in the OpenMP_VV test suite.
---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      | 17 ++++++----
 .../LLVMIR/openmp-target-simd-on_device.mlir  | 34 +++++++++++++++++++
 2 files changed, 45 insertions(+), 6 deletions(-)
 create mode 100644 mlir/test/Target/LLVMIR/openmp-target-simd-on_device.mlir

diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index abef2cb7411aa..c7dce5d6c6556 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1344,13 +1344,23 @@ allocatePrivateVars(llvm::IRBuilderBase &builder,
     llvm::SmallVectorImpl<llvm::Value *> &llvmPrivateVars,
     const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
     llvm::DenseMap *mappedPrivateVars = nullptr) {
-  llvm::IRBuilderBase::InsertPointGuard guard(builder);
   // Allocate private vars
   llvm::BranchInst *allocaTerminator =
       llvm::cast<llvm::BranchInst>(allocaIP.getBlock()->getTerminator());
+  if (allocaTerminator->getNumSuccessors() != 1) {
+    splitBB(llvm::OpenMPIRBuilder::InsertPointTy(
+                allocaIP.getBlock(), allocaTerminator->getIterator()),
+            true, "omp.region.after_alloca");
+  }
+
+  llvm::IRBuilderBase::InsertPointGuard guard(builder);
+  // Update the allocaTerminator in case the alloca block was split above.
+  allocaTerminator =
+      llvm::cast<llvm::BranchInst>(allocaIP.getBlock()->getTerminator());
   builder.SetInsertPoint(allocaTerminator);
   assert(allocaTerminator->getNumSuccessors() == 1 &&
          "This is an unconditional branch created by OpenMPIRBuilder");
+  llvm::BasicBlock *afterAllocas = allocaTerminator->getSuccessor(0);

   // FIXME: Some of the allocation regions do more than just allocating.
@@ -1880,11 +1890,6 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, SmallVector privateReductionVariables( wsloopOp.getNumReductionVars()); - splitBB(llvm::OpenMPIRBuilder::InsertPointTy( - allocaIP.getBlock(), - allocaIP.getBlock()->getTerminator()->getIterator()), - true, "omp.region.after_alloca"); - llvm::Expected afterAllocas = allocatePrivateVars( builder, moduleTranslation, privateBlockArgs, privateDecls, mlirPrivateVars, llvmPrivateVars, allocaIP); diff --git a/mlir/test/Target/LLVMIR/openmp-target-simd-on_device.mlir b/mlir/test/Target/LLVMIR/openmp-target-simd-on_device.mlir new file mode 100644 index 0000000000000..0ce90578ea9d6 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-target-simd-on_device.mlir @@ -0,0 +1,34 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +module attributes {omp.is_target_device = true} { + omp.private {type = private} @simd_privatizer : !llvm.ptr alloc { + ^bb0(%arg0: !llvm.ptr): + omp.yield(%arg0 : !llvm.ptr) + } + + llvm.func @test_target_simd() { + omp.target { + %5 = llvm.mlir.constant(1 : i32) : i32 + %x = llvm.alloca %5 x i32 {bindc_name = "x"} : (i32) -> !llvm.ptr + omp.simd private(@simd_privatizer %x -> %arg1 : !llvm.ptr) { + omp.loop_nest (%arg2) : i32 = (%5) to (%5) inclusive step (%5) { + omp.yield + } + } + omp.terminator + } + llvm.return + } + +} + +// CHECK-LABEL: define {{.*}} @__omp_offloading_{{.*}}_test_target_simd_{{.*}} + +// CHECK: %[[INT:.*]] = alloca i32, align 4 +// CHECK: br label %[[LATE_ALLOC_BB:.*]] + +// CHECK: [[LATE_ALLOC_BB]]: +// CHECK: br label %[[AFTER_ALLOC_BB:.*]] + +// CHECK: [[AFTER_ALLOC_BB]]: +// CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}} From 6ca560a9092e29c9f9817db6d6da09edd5f0ded7 Mon Sep 17 00:00:00 2001 From: Boaz Brickner Date: Wed, 15 Jan 2025 14:56:07 +0100 Subject: [PATCH 66/82] [clang] Add support for passing FileSystem to buildASTFromCodeWithArgs() (#123042) This would allow tools that don't use the real file system to use this function. --- clang/include/clang/Tooling/Tooling.h | 10 ++++++++-- clang/lib/Tooling/Tooling.cpp | 5 +++-- clang/unittests/Tooling/ToolingTest.cpp | 14 ++++++++++++++ 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/clang/include/clang/Tooling/Tooling.h b/clang/include/clang/Tooling/Tooling.h index 070706e8fa6d1..200fb30839a95 100644 --- a/clang/include/clang/Tooling/Tooling.h +++ b/clang/include/clang/Tooling/Tooling.h @@ -223,7 +223,11 @@ buildASTFromCode(StringRef Code, StringRef FileName = "input.cc", /// \param PCHContainerOps The PCHContainerOperations for loading and creating /// clang modules. /// -/// \param Adjuster A function to filter the command line arguments as specified. +/// \param Adjuster A function to filter the command line arguments as +/// specified. +/// +/// \param BaseFS FileSystem for managing and looking up files. +/// VirtualMappedFiles takes precedence. /// /// \return The resulting AST or null if an error occurred. std::unique_ptr buildASTFromCodeWithArgs( @@ -233,7 +237,9 @@ std::unique_ptr buildASTFromCodeWithArgs( std::make_shared(), ArgumentsAdjuster Adjuster = getClangStripDependencyFileAdjuster(), const FileContentMappings &VirtualMappedFiles = FileContentMappings(), - DiagnosticConsumer *DiagConsumer = nullptr); + DiagnosticConsumer *DiagConsumer = nullptr, + IntrusiveRefCntPtr BaseFS = + llvm::vfs::getRealFileSystem()); /// Utility to run a FrontendAction in a single clang invocation. 
class ToolInvocation { diff --git a/clang/lib/Tooling/Tooling.cpp b/clang/lib/Tooling/Tooling.cpp index 88b7349ce8fed..03523c3f17eda 100644 --- a/clang/lib/Tooling/Tooling.cpp +++ b/clang/lib/Tooling/Tooling.cpp @@ -692,11 +692,12 @@ std::unique_ptr buildASTFromCodeWithArgs( StringRef Code, const std::vector &Args, StringRef FileName, StringRef ToolName, std::shared_ptr PCHContainerOps, ArgumentsAdjuster Adjuster, const FileContentMappings &VirtualMappedFiles, - DiagnosticConsumer *DiagConsumer) { + DiagnosticConsumer *DiagConsumer, + IntrusiveRefCntPtr BaseFS) { std::vector> ASTs; ASTBuilderAction Action(ASTs); llvm::IntrusiveRefCntPtr OverlayFileSystem( - new llvm::vfs::OverlayFileSystem(llvm::vfs::getRealFileSystem())); + new llvm::vfs::OverlayFileSystem(std::move(BaseFS))); llvm::IntrusiveRefCntPtr InMemoryFileSystem( new llvm::vfs::InMemoryFileSystem); OverlayFileSystem->pushOverlay(InMemoryFileSystem); diff --git a/clang/unittests/Tooling/ToolingTest.cpp b/clang/unittests/Tooling/ToolingTest.cpp index 0b65577a05193..8cdfffb54390e 100644 --- a/clang/unittests/Tooling/ToolingTest.cpp +++ b/clang/unittests/Tooling/ToolingTest.cpp @@ -152,6 +152,20 @@ TEST(buildASTFromCode, ReportsErrors) { EXPECT_EQ(1u, Consumer.NumDiagnosticsSeen); } +TEST(buildASTFromCode, FileSystem) { + llvm::IntrusiveRefCntPtr InMemoryFileSystem( + new llvm::vfs::InMemoryFileSystem); + InMemoryFileSystem->addFile("included_file.h", 0, + llvm::MemoryBuffer::getMemBufferCopy("class X;")); + std::unique_ptr AST = buildASTFromCodeWithArgs( + R"(#include "included_file.h")", {}, "input.cc", "clang-tool", + std::make_shared(), + getClangStripDependencyFileAdjuster(), FileContentMappings(), nullptr, + InMemoryFileSystem); + ASSERT_TRUE(AST.get()); + EXPECT_TRUE(FindClassDeclX(AST.get())); +} + TEST(newFrontendActionFactory, CreatesFrontendActionFactoryFromType) { std::unique_ptr Factory( newFrontendActionFactory()); From a00938eedd4246c4252ce3e69a05c4b6760983a3 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Wed, 15 Jan 2025 13:56:42 +0000 Subject: [PATCH 67/82] Revert "[LoopVectorize] Add support for reverse loops in isDereferenceableAndAlignedInLoop (#96752)" (#123057) This reverts commit bfedf6460c2cad6e6f966b457d8d27084579dcd8. --- .../llvm/Analysis/LoopAccessAnalysis.h | 19 -- llvm/lib/Analysis/Loads.cpp | 109 ++++---- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 61 +++-- .../LoopVectorize/X86/load-deref-pred.ll | 238 ++++++++++++------ .../LoopVectorize/load-deref-pred-align.ll | 27 +- 5 files changed, 266 insertions(+), 188 deletions(-) diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h index 6fc6ca14d0889..31374a128856c 100644 --- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -853,25 +853,6 @@ bool sortPtrAccesses(ArrayRef VL, Type *ElemTy, const DataLayout &DL, bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType = true); -/// Calculate Start and End points of memory access. -/// Let's assume A is the first access and B is a memory access on N-th loop -/// iteration. Then B is calculated as: -/// B = A + Step*N . -/// Step value may be positive or negative. -/// N is a calculated back-edge taken count: -/// N = (TripCount > 0) ? RoundDown(TripCount -1 , VF) : 0 -/// Start and End points are calculated in the following way: -/// Start = UMIN(A, B) ; End = UMAX(A, B) + SizeOfElt, -/// where SizeOfElt is the size of single memory access in bytes. 
-/// -/// There is no conflict when the intervals are disjoint: -/// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End) -std::pair getStartAndEndForAccess( - const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *MaxBECount, - ScalarEvolution *SE, - DenseMap, - std::pair> *PointerBounds); - class LoopAccessInfoManager { /// The cache. DenseMap> LoopAccessInfoMap; diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index cc6760292c2ff..7bbd469bd035d 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -13,7 +13,6 @@ #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumeBundleQueries.h" -#include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryLocation.h" @@ -276,88 +275,84 @@ static bool AreEquivalentAddressValues(const Value *A, const Value *B) { bool llvm::isDereferenceableAndAlignedInLoop( LoadInst *LI, Loop *L, ScalarEvolution &SE, DominatorTree &DT, AssumptionCache *AC, SmallVectorImpl *Predicates) { - const Align Alignment = LI->getAlign(); auto &DL = LI->getDataLayout(); Value *Ptr = LI->getPointerOperand(); + APInt EltSize(DL.getIndexTypeSizeInBits(Ptr->getType()), DL.getTypeStoreSize(LI->getType()).getFixedValue()); + const Align Alignment = LI->getAlign(); + + Instruction *HeaderFirstNonPHI = L->getHeader()->getFirstNonPHI(); // If given a uniform (i.e. non-varying) address, see if we can prove the // access is safe within the loop w/o needing predication. if (L->isLoopInvariant(Ptr)) - return isDereferenceableAndAlignedPointer( - Ptr, Alignment, EltSize, DL, L->getHeader()->getFirstNonPHI(), AC, &DT); - - const SCEV *PtrScev = SE.getSCEV(Ptr); - auto *AddRec = dyn_cast(PtrScev); + return isDereferenceableAndAlignedPointer(Ptr, Alignment, EltSize, DL, + HeaderFirstNonPHI, AC, &DT); - // Check to see if we have a repeating access pattern and it's possible - // to prove all accesses are well aligned. + // Otherwise, check to see if we have a repeating access pattern where we can + // prove that all accesses are well aligned and dereferenceable. + auto *AddRec = dyn_cast(SE.getSCEV(Ptr)); if (!AddRec || AddRec->getLoop() != L || !AddRec->isAffine()) return false; - auto* Step = dyn_cast(AddRec->getStepRecurrence(SE)); if (!Step) return false; - // For the moment, restrict ourselves to the case where the access size is a - // multiple of the requested alignment and the base is aligned. - // TODO: generalize if a case found which warrants - if (EltSize.urem(Alignment.value()) != 0) + auto TC = SE.getSmallConstantMaxTripCount(L, Predicates); + if (!TC) return false; // TODO: Handle overlapping accesses. - if (EltSize.ugt(Step->getAPInt().abs())) - return false; - - const SCEV *MaxBECount = - SE.getPredicatedConstantMaxBackedgeTakenCount(L, *Predicates); - if (isa(MaxBECount)) - return false; - - const auto &[AccessStart, AccessEnd] = getStartAndEndForAccess( - L, PtrScev, LI->getType(), MaxBECount, &SE, nullptr); - if (isa(AccessStart) || - isa(AccessEnd)) + // We should be computing AccessSize as (TC - 1) * Step + EltSize. + if (EltSize.sgt(Step->getAPInt())) return false; - // Try to get the access size. - const SCEV *PtrDiff = SE.getMinusSCEV(AccessEnd, AccessStart); - APInt MaxPtrDiff = SE.getUnsignedRangeMax(PtrDiff); + // Compute the total access size for access patterns with unit stride and + // patterns with gaps. 
For patterns with unit stride, Step and EltSize are the + // same. + // For patterns with gaps (i.e. non unit stride), we are + // accessing EltSize bytes at every Step. + APInt AccessSize = TC * Step->getAPInt(); + assert(SE.isLoopInvariant(AddRec->getStart(), L) && + "implied by addrec definition"); Value *Base = nullptr; - APInt AccessSize; - if (const SCEVUnknown *NewBase = dyn_cast(AccessStart)) { - Base = NewBase->getValue(); - AccessSize = MaxPtrDiff; - } else if (auto *MinAdd = dyn_cast(AccessStart)) { - if (MinAdd->getNumOperands() != 2) - return false; - - const auto *Offset = dyn_cast(MinAdd->getOperand(0)); - const auto *NewBase = dyn_cast(MinAdd->getOperand(1)); - if (!Offset || !NewBase) - return false; - - // The following code below assumes the offset is unsigned, but GEP - // offsets are treated as signed so we can end up with a signed value - // here too. For example, suppose the initial PHI value is (i8 255), - // the offset will be treated as (i8 -1) and sign-extended to (i64 -1). - if (Offset->getAPInt().isNegative()) - return false; + if (auto *StartS = dyn_cast(AddRec->getStart())) { + Base = StartS->getValue(); + } else if (auto *StartS = dyn_cast(AddRec->getStart())) { + // Handle (NewBase + offset) as start value. + const auto *Offset = dyn_cast(StartS->getOperand(0)); + const auto *NewBase = dyn_cast(StartS->getOperand(1)); + if (StartS->getNumOperands() == 2 && Offset && NewBase) { + // The following code below assumes the offset is unsigned, but GEP + // offsets are treated as signed so we can end up with a signed value + // here too. For example, suppose the initial PHI value is (i8 255), + // the offset will be treated as (i8 -1) and sign-extended to (i64 -1). + if (Offset->getAPInt().isNegative()) + return false; - // For the moment, restrict ourselves to the case where the offset is a - // multiple of the requested alignment and the base is aligned. - // TODO: generalize if a case found which warrants - if (Offset->getAPInt().urem(Alignment.value()) != 0) - return false; + // For the moment, restrict ourselves to the case where the offset is a + // multiple of the requested alignment and the base is aligned. + // TODO: generalize if a case found which warrants + if (Offset->getAPInt().urem(Alignment.value()) != 0) + return false; + Base = NewBase->getValue(); + bool Overflow = false; + AccessSize = AccessSize.uadd_ov(Offset->getAPInt(), Overflow); + if (Overflow) + return false; + } + } - AccessSize = MaxPtrDiff + Offset->getAPInt(); - Base = NewBase->getValue(); - } else + if (!Base) return false; - Instruction *HeaderFirstNonPHI = L->getHeader()->getFirstNonPHI(); + // For the moment, restrict ourselves to the case where the access size is a + // multiple of the requested alignment and the base is aligned. + // TODO: generalize if a case found which warrants + if (EltSize.urem(Alignment.value()) != 0) + return false; return isDereferenceableAndAlignedPointer(Base, Alignment, AccessSize, DL, HeaderFirstNonPHI, AC, &DT); } diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 11e0a221fc887..2a68979add666 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -190,20 +190,31 @@ RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup( Members.push_back(Index); } -std::pair llvm::getStartAndEndForAccess( - const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *MaxBECount, - ScalarEvolution *SE, +/// Calculate Start and End points of memory access. 
+/// Let's assume A is the first access and B is a memory access on N-th loop +/// iteration. Then B is calculated as: +/// B = A + Step*N . +/// Step value may be positive or negative. +/// N is a calculated back-edge taken count: +/// N = (TripCount > 0) ? RoundDown(TripCount -1 , VF) : 0 +/// Start and End points are calculated in the following way: +/// Start = UMIN(A, B) ; End = UMAX(A, B) + SizeOfElt, +/// where SizeOfElt is the size of single memory access in bytes. +/// +/// There is no conflict when the intervals are disjoint: +/// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End) +static std::pair getStartAndEndForAccess( + const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, + PredicatedScalarEvolution &PSE, DenseMap, - std::pair> *PointerBounds) { - std::pair *PtrBoundsPair; - if (PointerBounds) { - auto [Iter, Ins] = PointerBounds->insert( - {{PtrExpr, AccessTy}, - {SE->getCouldNotCompute(), SE->getCouldNotCompute()}}); - if (!Ins) - return Iter->second; - PtrBoundsPair = &Iter->second; - } + std::pair> &PointerBounds) { + ScalarEvolution *SE = PSE.getSE(); + + auto [Iter, Ins] = PointerBounds.insert( + {{PtrExpr, AccessTy}, + {SE->getCouldNotCompute(), SE->getCouldNotCompute()}}); + if (!Ins) + return Iter->second; const SCEV *ScStart; const SCEV *ScEnd; @@ -211,8 +222,10 @@ std::pair llvm::getStartAndEndForAccess( if (SE->isLoopInvariant(PtrExpr, Lp)) { ScStart = ScEnd = PtrExpr; } else if (auto *AR = dyn_cast(PtrExpr)) { + const SCEV *Ex = PSE.getSymbolicMaxBackedgeTakenCount(); + ScStart = AR->getStart(); - ScEnd = AR->evaluateAtIteration(MaxBECount, *SE); + ScEnd = AR->evaluateAtIteration(Ex, *SE); const SCEV *Step = AR->getStepRecurrence(*SE); // For expressions with negative step, the upper bound is ScStart and the @@ -231,7 +244,7 @@ std::pair llvm::getStartAndEndForAccess( return {SE->getCouldNotCompute(), SE->getCouldNotCompute()}; assert(SE->isLoopInvariant(ScStart, Lp) && "ScStart needs to be invariant"); - assert(SE->isLoopInvariant(ScEnd, Lp) && "ScEnd needs to be invariant"); + assert(SE->isLoopInvariant(ScEnd, Lp)&& "ScEnd needs to be invariant"); // Add the size of the pointed element to ScEnd. auto &DL = Lp->getHeader()->getDataLayout(); @@ -239,10 +252,8 @@ std::pair llvm::getStartAndEndForAccess( const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IdxTy, AccessTy); ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV); - std::pair Res = {ScStart, ScEnd}; - if (PointerBounds) - *PtrBoundsPair = Res; - return Res; + Iter->second = {ScStart, ScEnd}; + return Iter->second; } /// Calculate Start and End points of memory access using @@ -252,9 +263,8 @@ void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr, unsigned DepSetId, unsigned ASId, PredicatedScalarEvolution &PSE, bool NeedsFreeze) { - const SCEV *MaxBECount = PSE.getSymbolicMaxBackedgeTakenCount(); const auto &[ScStart, ScEnd] = getStartAndEndForAccess( - Lp, PtrExpr, AccessTy, MaxBECount, PSE.getSE(), &DC.getPointerBounds()); + Lp, PtrExpr, AccessTy, PSE, DC.getPointerBounds()); assert(!isa(ScStart) && !isa(ScEnd) && "must be able to compute both start and end expressions"); @@ -1928,11 +1938,10 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize( // required for correctness. 
if (SE.isLoopInvariant(Src, InnermostLoop) || SE.isLoopInvariant(Sink, InnermostLoop)) { - const SCEV *MaxBECount = PSE.getSymbolicMaxBackedgeTakenCount(); - const auto &[SrcStart_, SrcEnd_] = getStartAndEndForAccess( - InnermostLoop, Src, ATy, MaxBECount, PSE.getSE(), &PointerBounds); - const auto &[SinkStart_, SinkEnd_] = getStartAndEndForAccess( - InnermostLoop, Sink, BTy, MaxBECount, PSE.getSE(), &PointerBounds); + const auto &[SrcStart_, SrcEnd_] = + getStartAndEndForAccess(InnermostLoop, Src, ATy, PSE, PointerBounds); + const auto &[SinkStart_, SinkEnd_] = + getStartAndEndForAccess(InnermostLoop, Sink, BTy, PSE, PointerBounds); if (!isa(SrcStart_) && !isa(SrcEnd_) && !isa(SinkStart_) && diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll index 3e50ee42866b9..1433e48690bc6 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll @@ -2920,8 +2920,8 @@ loop_exit: ret i32 %accum.next } -define i32 @test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base) { -; CHECK-LABEL: @test_non_unit_stride_off_by_four_bytes( +define i32 @neg_test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base) { +; CHECK-LABEL: @neg_test_non_unit_stride_off_by_four_bytes( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [103 x i32], align 4 ; CHECK-NEXT: call void @init(ptr [[ALLOCA]]) @@ -2929,11 +2929,11 @@ define i32 @test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base) { ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP112:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP113:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP114:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP115:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE33:%.*]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP144:%.*]], [[PRED_LOAD_CONTINUE33]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP145:%.*]], [[PRED_LOAD_CONTINUE33]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP146:%.*]], [[PRED_LOAD_CONTINUE33]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP147:%.*]], [[PRED_LOAD_CONTINUE33]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2 @@ -2999,74 +2999,170 @@ define i32 @test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 
[[TMP1]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP80:%.*]] = load i32, ptr [[TMP64]], align 4 -; CHECK-NEXT: [[TMP81:%.*]] = load i32, ptr [[TMP65]], align 4 -; CHECK-NEXT: [[TMP82:%.*]] = load i32, ptr [[TMP66]], align 4 -; CHECK-NEXT: [[TMP83:%.*]] = load i32, ptr [[TMP67]], align 4 -; CHECK-NEXT: [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0 -; CHECK-NEXT: [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1 -; CHECK-NEXT: [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2 -; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3 -; CHECK-NEXT: [[TMP88:%.*]] = load i32, ptr [[TMP68]], align 4 -; CHECK-NEXT: [[TMP89:%.*]] = load i32, ptr [[TMP69]], align 4 -; CHECK-NEXT: [[TMP90:%.*]] = load i32, ptr [[TMP70]], align 4 -; CHECK-NEXT: [[TMP91:%.*]] = load i32, ptr [[TMP71]], align 4 -; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0 -; CHECK-NEXT: [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1 -; CHECK-NEXT: [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2 -; CHECK-NEXT: [[TMP95:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP91]], i32 3 -; CHECK-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP72]], align 4 -; CHECK-NEXT: [[TMP97:%.*]] = load i32, ptr [[TMP73]], align 4 -; CHECK-NEXT: [[TMP98:%.*]] = load i32, ptr [[TMP74]], align 4 -; CHECK-NEXT: [[TMP99:%.*]] = load i32, ptr [[TMP75]], align 4 -; CHECK-NEXT: [[TMP100:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0 -; CHECK-NEXT: [[TMP101:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP97]], i32 1 -; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP101]], i32 [[TMP98]], i32 2 -; CHECK-NEXT: [[TMP103:%.*]] = insertelement <4 x i32> [[TMP102]], i32 [[TMP99]], i32 3 -; CHECK-NEXT: [[TMP104:%.*]] = load i32, ptr [[TMP76]], align 4 -; CHECK-NEXT: [[TMP105:%.*]] = load i32, ptr [[TMP77]], align 4 -; CHECK-NEXT: [[TMP106:%.*]] = load i32, ptr [[TMP78]], align 4 -; CHECK-NEXT: [[TMP107:%.*]] = load i32, ptr [[TMP79]], align 4 -; CHECK-NEXT: [[TMP108:%.*]] = insertelement <4 x i32> poison, i32 [[TMP104]], i32 0 -; CHECK-NEXT: [[TMP109:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP105]], i32 1 -; CHECK-NEXT: [[TMP110:%.*]] = insertelement <4 x i32> 
[[TMP109]], i32 [[TMP106]], i32 2 -; CHECK-NEXT: [[TMP111:%.*]] = insertelement <4 x i32> [[TMP110]], i32 [[TMP107]], i32 3 -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP87]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI4:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP95]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI5:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP103]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI6:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP111]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP112]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP113]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI4]] -; CHECK-NEXT: [[TMP114]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI5]] -; CHECK-NEXT: [[TMP115]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI6]] +; CHECK-NEXT: [[TMP64:%.*]] = extractelement <4 x i1> [[TMP39]], i32 0 +; CHECK-NEXT: br i1 [[TMP64]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK: pred.load.if: +; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP66:%.*]] = load i32, ptr [[TMP65]], align 4 +; CHECK-NEXT: [[TMP67:%.*]] = insertelement <4 x i32> poison, i32 [[TMP66]], i32 0 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] +; CHECK: pred.load.continue: +; CHECK-NEXT: [[TMP68:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP67]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP69:%.*]] = extractelement <4 x i1> [[TMP39]], i32 1 +; CHECK-NEXT: br i1 [[TMP69]], label [[PRED_LOAD_IF4:%.*]], label [[PRED_LOAD_CONTINUE5:%.*]] +; CHECK: pred.load.if4: +; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP71:%.*]] = load i32, ptr [[TMP70]], align 4 +; CHECK-NEXT: [[TMP72:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP71]], i32 1 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE5]] +; CHECK: pred.load.continue5: +; CHECK-NEXT: [[TMP73:%.*]] = phi <4 x i32> [ [[TMP68]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP72]], [[PRED_LOAD_IF4]] ] +; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x i1> [[TMP39]], i32 2 +; CHECK-NEXT: br i1 [[TMP74]], label [[PRED_LOAD_IF6:%.*]], label [[PRED_LOAD_CONTINUE7:%.*]] +; CHECK: pred.load.if6: +; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP76:%.*]] = load i32, ptr [[TMP75]], align 4 +; CHECK-NEXT: [[TMP77:%.*]] = insertelement <4 x i32> [[TMP73]], i32 [[TMP76]], i32 2 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE7]] +; CHECK: pred.load.continue7: +; CHECK-NEXT: [[TMP78:%.*]] = phi <4 x i32> [ [[TMP73]], [[PRED_LOAD_CONTINUE5]] ], [ [[TMP77]], [[PRED_LOAD_IF6]] ] +; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i1> [[TMP39]], i32 3 +; CHECK-NEXT: br i1 [[TMP79]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]] +; CHECK: pred.load.if8: +; CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP81:%.*]] = load i32, ptr [[TMP80]], align 4 +; CHECK-NEXT: [[TMP82:%.*]] = insertelement <4 x i32> [[TMP78]], i32 [[TMP81]], i32 3 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE9]] +; CHECK: pred.load.continue9: +; CHECK-NEXT: [[TMP83:%.*]] = phi <4 x i32> [ [[TMP78]], [[PRED_LOAD_CONTINUE7]] ], [ [[TMP82]], [[PRED_LOAD_IF8]] ] +; CHECK-NEXT: [[TMP84:%.*]] = extractelement <4 x i1> [[TMP47]], i32 0 +; CHECK-NEXT: br i1 [[TMP84]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]] +; CHECK: pred.load.if10: +; CHECK-NEXT: [[TMP85:%.*]] = 
getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP86:%.*]] = load i32, ptr [[TMP85]], align 4 +; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> poison, i32 [[TMP86]], i32 0 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE11]] +; CHECK: pred.load.continue11: +; CHECK-NEXT: [[TMP88:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE9]] ], [ [[TMP87]], [[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[TMP89:%.*]] = extractelement <4 x i1> [[TMP47]], i32 1 +; CHECK-NEXT: br i1 [[TMP89]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]] +; CHECK: pred.load.if12: +; CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP91:%.*]] = load i32, ptr [[TMP90]], align 4 +; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> [[TMP88]], i32 [[TMP91]], i32 1 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE13]] +; CHECK: pred.load.continue13: +; CHECK-NEXT: [[TMP93:%.*]] = phi <4 x i32> [ [[TMP88]], [[PRED_LOAD_CONTINUE11]] ], [ [[TMP92]], [[PRED_LOAD_IF12]] ] +; CHECK-NEXT: [[TMP94:%.*]] = extractelement <4 x i1> [[TMP47]], i32 2 +; CHECK-NEXT: br i1 [[TMP94]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]] +; CHECK: pred.load.if14: +; CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP95]], align 4 +; CHECK-NEXT: [[TMP97:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP96]], i32 2 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE15]] +; CHECK: pred.load.continue15: +; CHECK-NEXT: [[TMP98:%.*]] = phi <4 x i32> [ [[TMP93]], [[PRED_LOAD_CONTINUE13]] ], [ [[TMP97]], [[PRED_LOAD_IF14]] ] +; CHECK-NEXT: [[TMP99:%.*]] = extractelement <4 x i1> [[TMP47]], i32 3 +; CHECK-NEXT: br i1 [[TMP99]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]] +; CHECK: pred.load.if16: +; CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP101:%.*]] = load i32, ptr [[TMP100]], align 4 +; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP98]], i32 [[TMP101]], i32 3 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE17]] +; CHECK: pred.load.continue17: +; CHECK-NEXT: [[TMP103:%.*]] = phi <4 x i32> [ [[TMP98]], [[PRED_LOAD_CONTINUE15]] ], [ [[TMP102]], [[PRED_LOAD_IF16]] ] +; CHECK-NEXT: [[TMP104:%.*]] = extractelement <4 x i1> [[TMP55]], i32 0 +; CHECK-NEXT: br i1 [[TMP104]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]] +; CHECK: pred.load.if18: +; CHECK-NEXT: [[TMP105:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP106:%.*]] = load i32, ptr [[TMP105]], align 4 +; CHECK-NEXT: [[TMP107:%.*]] = insertelement <4 x i32> poison, i32 [[TMP106]], i32 0 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE19]] +; CHECK: pred.load.continue19: +; CHECK-NEXT: [[TMP108:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE17]] ], [ [[TMP107]], [[PRED_LOAD_IF18]] ] +; CHECK-NEXT: [[TMP109:%.*]] = extractelement <4 x i1> [[TMP55]], i32 1 +; CHECK-NEXT: br i1 [[TMP109]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]] +; CHECK: pred.load.if20: +; CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP111:%.*]] = load i32, ptr [[TMP110]], align 4 +; CHECK-NEXT: [[TMP112:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP111]], i32 1 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE21]] +; CHECK: pred.load.continue21: +; CHECK-NEXT: [[TMP113:%.*]] = phi <4 x i32> [ [[TMP108]], 
[[PRED_LOAD_CONTINUE19]] ], [ [[TMP112]], [[PRED_LOAD_IF20]] ] +; CHECK-NEXT: [[TMP114:%.*]] = extractelement <4 x i1> [[TMP55]], i32 2 +; CHECK-NEXT: br i1 [[TMP114]], label [[PRED_LOAD_IF22:%.*]], label [[PRED_LOAD_CONTINUE23:%.*]] +; CHECK: pred.load.if22: +; CHECK-NEXT: [[TMP115:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP116:%.*]] = load i32, ptr [[TMP115]], align 4 +; CHECK-NEXT: [[TMP117:%.*]] = insertelement <4 x i32> [[TMP113]], i32 [[TMP116]], i32 2 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE23]] +; CHECK: pred.load.continue23: +; CHECK-NEXT: [[TMP118:%.*]] = phi <4 x i32> [ [[TMP113]], [[PRED_LOAD_CONTINUE21]] ], [ [[TMP117]], [[PRED_LOAD_IF22]] ] +; CHECK-NEXT: [[TMP119:%.*]] = extractelement <4 x i1> [[TMP55]], i32 3 +; CHECK-NEXT: br i1 [[TMP119]], label [[PRED_LOAD_IF24:%.*]], label [[PRED_LOAD_CONTINUE25:%.*]] +; CHECK: pred.load.if24: +; CHECK-NEXT: [[TMP120:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP121:%.*]] = load i32, ptr [[TMP120]], align 4 +; CHECK-NEXT: [[TMP122:%.*]] = insertelement <4 x i32> [[TMP118]], i32 [[TMP121]], i32 3 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE25]] +; CHECK: pred.load.continue25: +; CHECK-NEXT: [[TMP123:%.*]] = phi <4 x i32> [ [[TMP118]], [[PRED_LOAD_CONTINUE23]] ], [ [[TMP122]], [[PRED_LOAD_IF24]] ] +; CHECK-NEXT: [[TMP124:%.*]] = extractelement <4 x i1> [[TMP63]], i32 0 +; CHECK-NEXT: br i1 [[TMP124]], label [[PRED_LOAD_IF26:%.*]], label [[PRED_LOAD_CONTINUE27:%.*]] +; CHECK: pred.load.if26: +; CHECK-NEXT: [[TMP125:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP126:%.*]] = load i32, ptr [[TMP125]], align 4 +; CHECK-NEXT: [[TMP127:%.*]] = insertelement <4 x i32> poison, i32 [[TMP126]], i32 0 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE27]] +; CHECK: pred.load.continue27: +; CHECK-NEXT: [[TMP128:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE25]] ], [ [[TMP127]], [[PRED_LOAD_IF26]] ] +; CHECK-NEXT: [[TMP129:%.*]] = extractelement <4 x i1> [[TMP63]], i32 1 +; CHECK-NEXT: br i1 [[TMP129]], label [[PRED_LOAD_IF28:%.*]], label [[PRED_LOAD_CONTINUE29:%.*]] +; CHECK: pred.load.if28: +; CHECK-NEXT: [[TMP130:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP131:%.*]] = load i32, ptr [[TMP130]], align 4 +; CHECK-NEXT: [[TMP132:%.*]] = insertelement <4 x i32> [[TMP128]], i32 [[TMP131]], i32 1 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE29]] +; CHECK: pred.load.continue29: +; CHECK-NEXT: [[TMP133:%.*]] = phi <4 x i32> [ [[TMP128]], [[PRED_LOAD_CONTINUE27]] ], [ [[TMP132]], [[PRED_LOAD_IF28]] ] +; CHECK-NEXT: [[TMP134:%.*]] = extractelement <4 x i1> [[TMP63]], i32 2 +; CHECK-NEXT: br i1 [[TMP134]], label [[PRED_LOAD_IF30:%.*]], label [[PRED_LOAD_CONTINUE31:%.*]] +; CHECK: pred.load.if30: +; CHECK-NEXT: [[TMP135:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP136:%.*]] = load i32, ptr [[TMP135]], align 4 +; CHECK-NEXT: [[TMP137:%.*]] = insertelement <4 x i32> [[TMP133]], i32 [[TMP136]], i32 2 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE31]] +; CHECK: pred.load.continue31: +; CHECK-NEXT: [[TMP138:%.*]] = phi <4 x i32> [ [[TMP133]], [[PRED_LOAD_CONTINUE29]] ], [ [[TMP137]], [[PRED_LOAD_IF30]] ] +; CHECK-NEXT: [[TMP139:%.*]] = extractelement <4 x i1> [[TMP63]], i32 3 +; CHECK-NEXT: br i1 [[TMP139]], label [[PRED_LOAD_IF32:%.*]], label [[PRED_LOAD_CONTINUE33]] +; CHECK: pred.load.if32: +; CHECK-NEXT: [[TMP140:%.*]] = getelementptr inbounds i32, ptr 
[[ALLOCA]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP141:%.*]] = load i32, ptr [[TMP140]], align 4 +; CHECK-NEXT: [[TMP142:%.*]] = insertelement <4 x i32> [[TMP138]], i32 [[TMP141]], i32 3 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE33]] +; CHECK: pred.load.continue33: +; CHECK-NEXT: [[TMP143:%.*]] = phi <4 x i32> [ [[TMP138]], [[PRED_LOAD_CONTINUE31]] ], [ [[TMP142]], [[PRED_LOAD_IF32]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP83]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI34:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP103]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI35:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP123]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI36:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP143]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP144]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP145]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI34]] +; CHECK-NEXT: [[TMP146]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI35]] +; CHECK-NEXT: [[TMP147]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI36]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP116:%.*]] = icmp eq i64 [[INDEX_NEXT]], 48 -; CHECK-NEXT: br i1 [[TMP116]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; CHECK-NEXT: [[TMP148:%.*]] = icmp eq i64 [[INDEX_NEXT]], 48 +; CHECK-NEXT: br i1 [[TMP148]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP113]], [[TMP112]] -; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <4 x i32> [[TMP114]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP115]], [[BIN_RDX7]] -; CHECK-NEXT: [[TMP117:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX8]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP145]], [[TMP144]] +; CHECK-NEXT: [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP146]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX38:%.*]] = add <4 x i32> [[TMP147]], [[BIN_RDX37]] +; CHECK-NEXT: [[TMP149:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]]) ; CHECK-NEXT: br i1 false, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP117]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP149]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -3085,7 +3181,7 @@ define i32 @test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 100 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP37:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP117]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP149]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll index cbc483fabc184..0f4e327891899 100644 --- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll +++ 
b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
@@ -351,30 +351,27 @@ define void @test_rev_loops_deref_loads(ptr nocapture noundef writeonly %dest) {
 ; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i32> [[REVERSE]], splat (i32 3)
 ; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true)
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i32 -1
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP8]], align 4
-; CHECK-NEXT: [[REVERSE2:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD1]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
 ; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK: pred.store.if:
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4
 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[REVERSE2]], i32 0
 ; CHECK-NEXT: [[TMP12:%.*]] = shl nsw i32 [[TMP11]], 2
 ; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP10]], align 4
 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
 ; CHECK: pred.store.continue:
 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
 ; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]]
-; CHECK: pred.store.if3:
+; CHECK: pred.store.if1:
 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], -1
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP21]], align 4
 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[REVERSE2]], i32 1
 ; CHECK-NEXT: [[TMP17:%.*]] = shl nsw i32 [[TMP16]], 2
 ; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4
 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]]
-; CHECK: pred.store.continue4:
+; CHECK: pred.store.continue2:
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -668,15 +665,12 @@ define void @test_rev_loops_strided_deref_loads(ptr nocapture noundef writeonly
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i32> [[REVERSE]], splat (i32 3)
 ; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true)
 ; CHECK-NEXT: [[TMP6:%.*]] = mul <2 x i64> [[VEC_IND]], splat (i64 2)
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP8]], align 4
-; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 4
 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
 ; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF:%.*]],
label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK: pred.store.if:
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 4
 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP0]]
 ; CHECK-NEXT: [[TMP15:%.*]] = shl nsw i32 [[TMP11]], 2
 ; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4
@@ -686,6 +680,9 @@ define void @test_rev_loops_strided_deref_loads(ptr nocapture noundef writeonly
 ; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
 ; CHECK: pred.store.if1:
 ; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], -1
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP23]]
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP24]], align 4
 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP17]]
 ; CHECK-NEXT: [[TMP19:%.*]] = shl nsw i32 [[TMP12]], 2
 ; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4

From cf2e828925dc8c9656e800387820b49be03109d6 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra
Date: Wed, 15 Jan 2025 14:19:23 +0000
Subject: [PATCH 68/82] SCEV: regen some tests with UTC (#123050)

While at it, move a test that calls the IndVarSimplify pass into the
IndVarSimplify directory.
---
 .../ScalarEvolution/2007-08-06-Unsigned.ll    |  37 +++--
 .../ScalarEvolution/implied-via-addition.ll   |  50 ------
 .../ScalarEvolution/implied-via-division.ll   | 148 ++++++++++++++----
 .../ScalarEvolution/infer-prestart-no-wrap.ll | 115 ++++++++++++--
 .../IndVarSimplify/implied-via-addition.ll    |  78 +++++++++
 5 files changed, 314 insertions(+), 114 deletions(-)
 delete mode 100644 llvm/test/Analysis/ScalarEvolution/implied-via-addition.ll
 create mode 100644 llvm/test/Transforms/IndVarSimplify/implied-via-addition.ll

diff --git a/llvm/test/Analysis/ScalarEvolution/2007-08-06-Unsigned.ll b/llvm/test/Analysis/ScalarEvolution/2007-08-06-Unsigned.ll
index 22404b102a73f..26f60c00ae1bf 100644
--- a/llvm/test/Analysis/ScalarEvolution/2007-08-06-Unsigned.ll
+++ b/llvm/test/Analysis/ScalarEvolution/2007-08-06-Unsigned.ll
@@ -1,32 +1,39 @@
-; RUN: opt < %s "-passes=print<scalar-evolution>" -disable-output 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes="print<scalar-evolution>" -disable-output \
+; RUN:   -scalar-evolution-classify-expressions=0 2>&1 | FileCheck %s
 ; PR1597

-; CHECK: Loop %bb: backedge-taken count is (-1 + (-1 * %x) + %y)
-
 define i32 @f(i32 %x, i32 %y) {
+; CHECK-LABEL: 'f'
+; CHECK-NEXT: Determining loop execution counts for: @f
+; CHECK-NEXT: Loop %bb: backedge-taken count is (-1 + (-1 * %x) + %y)
+; CHECK-NEXT: Loop %bb: constant max backedge-taken count is i32 -1
+; CHECK-NEXT: Loop %bb: symbolic max backedge-taken count is (-1 + (-1 * %x) + %y)
+; CHECK-NEXT: Loop %bb: Trip multiple is 1
+;
 entry:
- %tmp63 = icmp ult i32 %x, %y ; <i1> [#uses=1]
- br i1 %tmp63, label %bb.preheader, label %bb8
+ %tmp63 = icmp ult i32 %x, %y ; <i1> [#uses=1]
+ br i1 %tmp63, label %bb.preheader, label %bb8

 bb.preheader: ; preds = %entry
- br label %bb
+ br label %bb

 bb: ; preds = %bb3, %bb.preheader
- %x_addr.0 = phi i32 [ %tmp2, %bb3 ], [ %x, %bb.preheader ] ; <i32> [#uses=1]
- %tmp2 = add i32 %x_addr.0, 1 ; <i32> [#uses=3]
- br label %bb3
+ %x_addr.0 = phi i32 [ %tmp2, %bb3 ], [ %x, %bb.preheader ] ; <i32> [#uses=1]
+ %tmp2 = add i32 %x_addr.0, 1 ; <i32> [#uses=3]
+ br label %bb3

 bb3: ; preds = %bb
- %tmp6 = icmp ult i32 %tmp2, %y ; <i1> [#uses=1]
- br i1 %tmp6, label %bb, label %bb8.loopexit
+ %tmp6 = icmp ult i32 %tmp2, %y ; <i1> [#uses=1]
+ br i1 %tmp6, label %bb, label %bb8.loopexit

 bb8.loopexit: ; preds = %bb3
- br label %bb8
+ br label %bb8

 bb8: ; preds = %bb8.loopexit, %entry
- %x_addr.1 = phi i32 [ %x, %entry ], [ %tmp2, %bb8.loopexit ] ; <i32> [#uses=1]
- br label %return
+ %x_addr.1 = phi i32 [ %x, %entry ], [ %tmp2, %bb8.loopexit ] ; <i32> [#uses=1]
+ br label %return

 return: ; preds = %bb8
- ret i32 %x_addr.1
+ ret i32 %x_addr.1
 }
diff --git a/llvm/test/Analysis/ScalarEvolution/implied-via-addition.ll b/llvm/test/Analysis/ScalarEvolution/implied-via-addition.ll
deleted file mode 100644
index 4a0ebf810568e..0000000000000
--- a/llvm/test/Analysis/ScalarEvolution/implied-via-addition.ll
+++ /dev/null
@@ -1,50 +0,0 @@
-; RUN: opt -passes=indvars -S < %s | FileCheck %s
-
-declare void @use(i1)
-
-declare void @llvm.experimental.guard(i1, ...)
-
-define void @test_01(i8 %t) {
-; CHECK-LABEL: test_01
- entry:
- %st = sext i8 %t to i16
- %cmp1 = icmp slt i16 %st, 42
- call void(i1, ...) @llvm.experimental.guard(i1 %cmp1) [ "deopt"() ]
- br label %loop
-
- loop:
-; CHECK-LABEL: loop
- %idx = phi i8 [ %t, %entry ], [ %idx.inc, %loop ]
- %idx.inc = add i8 %idx, 1
- %c = icmp slt i8 %idx, 42
-; CHECK: call void @use(i1 true)
- call void @use(i1 %c)
- %be = icmp slt i8 %idx.inc, 42
- br i1 %be, label %loop, label %exit
-
- exit:
- ret void
-}
-
-define void @test_02(i8 %t) {
-; CHECK-LABEL: test_02
- entry:
- %t.ptr = inttoptr i8 %t to ptr
- %p.42 = inttoptr i8 42 to ptr
- %cmp1 = icmp slt ptr %t.ptr, %p.42
- call void(i1, ...) @llvm.experimental.guard(i1 %cmp1) [ "deopt"() ]
- br label %loop
-
- loop:
-; CHECK-LABEL: loop
- %idx = phi ptr [ %t.ptr, %entry ], [ %snext, %loop ]
- %snext = getelementptr inbounds i8, ptr %idx, i64 1
- %c = icmp slt ptr %idx, %p.42
-; CHECK: call void @use(i1 true)
- call void @use(i1 %c)
- %be = icmp slt ptr %snext, %p.42
- br i1 %be, label %loop, label %exit
-
- exit:
- ret void
-}
diff --git a/llvm/test/Analysis/ScalarEvolution/implied-via-division.ll b/llvm/test/Analysis/ScalarEvolution/implied-via-division.ll
index 6d38f510c4997..fbe69b4b18897 100644
--- a/llvm/test/Analysis/ScalarEvolution/implied-via-division.ll
+++ b/llvm/test/Analysis/ScalarEvolution/implied-via-division.ll
@@ -1,11 +1,18 @@
-; RUN: opt < %s -disable-output "-passes=print<scalar-evolution>" 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -disable-output -passes="print<scalar-evolution>" \
+; RUN:   -scalar-evolution-classify-expressions=0 2>&1 | FileCheck %s

 declare void @llvm.experimental.guard(i1, ...)

 define void @test_1(i32 %n) nounwind {
 ; Prove that (n > 1) ===> (n / 2 > 0).
-; CHECK: Determining loop execution counts for: @test_1 -; CHECK: Loop %header: backedge-taken count is (-1 + %n.div.2) +; CHECK-LABEL: 'test_1' +; CHECK-NEXT: Determining loop execution counts for: @test_1 +; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + %n.div.2) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 1073741822 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + %n.div.2) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; entry: %cmp1 = icmp sgt i32 %n, 1 %n.div.2 = sdiv i32 %n, 2 @@ -24,8 +31,13 @@ exit: define void @test_1neg(i32 %n) nounwind { ; Prove that (n > 0) =\=> (n / 2 > 0). -; CHECK: Determining loop execution counts for: @test_1neg -; CHECK: Loop %header: backedge-taken count is (-1 + (1 smax %n.div.2)) +; CHECK-LABEL: 'test_1neg' +; CHECK-NEXT: Determining loop execution counts for: @test_1neg +; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + (1 smax %n.div.2)) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 1073741822 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + (1 smax %n.div.2)) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; entry: %cmp1 = icmp sgt i32 %n, 0 %n.div.2 = sdiv i32 %n, 2 @@ -44,8 +56,13 @@ exit: define void @test_2(i32 %n) nounwind { ; Prove that (n >= 2) ===> (n / 2 > 0). -; CHECK: Determining loop execution counts for: @test_2 -; CHECK: Loop %header: backedge-taken count is (-1 + %n.div.2) +; CHECK-LABEL: 'test_2' +; CHECK-NEXT: Determining loop execution counts for: @test_2 +; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + %n.div.2) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 1073741822 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + %n.div.2) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; entry: %cmp1 = icmp sge i32 %n, 2 %n.div.2 = sdiv i32 %n, 2 @@ -64,8 +81,13 @@ exit: define void @test_2neg(i32 %n) nounwind { ; Prove that (n >= 1) =\=> (n / 2 > 0). -; CHECK: Determining loop execution counts for: @test_2neg -; CHECK: Loop %header: backedge-taken count is (-1 + (1 smax %n.div.2)) +; CHECK-LABEL: 'test_2neg' +; CHECK-NEXT: Determining loop execution counts for: @test_2neg +; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + (1 smax %n.div.2)) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 1073741822 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + (1 smax %n.div.2)) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; entry: %cmp1 = icmp sge i32 %n, 1 %n.div.2 = sdiv i32 %n, 2 @@ -84,8 +106,13 @@ exit: define void @test_3(i32 %n) nounwind { ; Prove that (n > -2) ===> (n / 2 >= 0). -; CHECK: Determining loop execution counts for: @test_3 -; CHECK: Loop %header: backedge-taken count is (1 + %n.div.2) +; CHECK-LABEL: 'test_3' +; CHECK-NEXT: Determining loop execution counts for: @test_3 +; CHECK-NEXT: Loop %header: backedge-taken count is (1 + %n.div.2) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 1073741824 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (1 + %n.div.2) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; entry: %cmp1 = icmp sgt i32 %n, -2 %n.div.2 = sdiv i32 %n, 2 @@ -104,8 +131,13 @@ exit: define void @test_3neg(i32 %n) nounwind { ; Prove that (n > -3) =\=> (n / 2 >= 0). 
-; CHECK: Determining loop execution counts for: @test_3neg -; CHECK: Loop %header: backedge-taken count is (0 smax (1 + %n.div.2)) +; CHECK-LABEL: 'test_3neg' +; CHECK-NEXT: Determining loop execution counts for: @test_3neg +; CHECK-NEXT: Loop %header: backedge-taken count is (0 smax (1 + %n.div.2)) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 1073741824 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (0 smax (1 + %n.div.2)) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; entry: %cmp1 = icmp sgt i32 %n, -3 %n.div.2 = sdiv i32 %n, 2 @@ -124,8 +156,13 @@ exit: define void @test_4(i32 %n) nounwind { ; Prove that (n >= -1) ===> (n / 2 >= 0). -; CHECK: Determining loop execution counts for: @test_4 -; CHECK: Loop %header: backedge-taken count is (1 + %n.div.2) +; CHECK-LABEL: 'test_4' +; CHECK-NEXT: Determining loop execution counts for: @test_4 +; CHECK-NEXT: Loop %header: backedge-taken count is (1 + %n.div.2) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 1073741824 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (1 + %n.div.2) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; entry: %cmp1 = icmp sge i32 %n, -1 %n.div.2 = sdiv i32 %n, 2 @@ -144,8 +181,13 @@ exit: define void @test_4neg(i32 %n) nounwind { ; Prove that (n >= -2) =\=> (n / 2 >= 0). -; CHECK: Determining loop execution counts for: @test_4neg -; CHECK: Loop %header: backedge-taken count is (0 smax (1 + %n.div.2)) +; CHECK-LABEL: 'test_4neg' +; CHECK-NEXT: Determining loop execution counts for: @test_4neg +; CHECK-NEXT: Loop %header: backedge-taken count is (0 smax (1 + %n.div.2)) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 1073741824 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (0 smax (1 + %n.div.2)) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; entry: %cmp1 = icmp sge i32 %n, -2 %n.div.2 = sdiv i32 %n, 2 @@ -164,8 +206,13 @@ exit: define void @test_ext_01(i32 %n) nounwind { ; Prove that (n > 1) ===> (n / 2 > 0). -; CHECK: Determining loop execution counts for: @test_ext_01 -; CHECK: Loop %header: backedge-taken count is (-1 + (sext i32 %n.div.2 to i64)) +; CHECK-LABEL: 'test_ext_01' +; CHECK-NEXT: Determining loop execution counts for: @test_ext_01 +; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + (sext i32 %n.div.2 to i64)) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i64 1073741822 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + (sext i32 %n.div.2 to i64)) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; entry: %cmp1 = icmp sgt i32 %n, 1 %n.div.2 = sdiv i32 %n, 2 @@ -185,8 +232,13 @@ exit: define void @test_ext_01neg(i32 %n) nounwind { ; Prove that (n > 0) =\=> (n / 2 > 0). -; CHECK: Determining loop execution counts for: @test_ext_01neg -; CHECK: Loop %header: backedge-taken count is (-1 + (1 smax (sext i32 %n.div.2 to i64))) +; CHECK-LABEL: 'test_ext_01neg' +; CHECK-NEXT: Determining loop execution counts for: @test_ext_01neg +; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + (1 smax (sext i32 %n.div.2 to i64))) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i64 1073741822 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + (1 smax (sext i32 %n.div.2 to i64))) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; entry: %cmp1 = icmp sgt i32 %n, 0 %n.div.2 = sdiv i32 %n, 2 @@ -206,8 +258,13 @@ exit: define void @test_ext_02(i32 %n) nounwind { ; Prove that (n >= 2) ===> (n / 2 > 0). 
-; CHECK: Determining loop execution counts for: @test_ext_02 -; CHECK: Loop %header: backedge-taken count is (-1 + (sext i32 %n.div.2 to i64)) +; CHECK-LABEL: 'test_ext_02' +; CHECK-NEXT: Determining loop execution counts for: @test_ext_02 +; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + (sext i32 %n.div.2 to i64)) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i64 1073741822 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + (sext i32 %n.div.2 to i64)) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; entry: %cmp1 = icmp sge i32 %n, 2 %n.div.2 = sdiv i32 %n, 2 @@ -227,8 +284,13 @@ exit: define void @test_ext_02neg(i32 %n) nounwind { ; Prove that (n >= 1) =\=> (n / 2 > 0). -; CHECK: Determining loop execution counts for: @test_ext_02neg -; CHECK: Loop %header: backedge-taken count is (-1 + (1 smax (sext i32 %n.div.2 to i64))) +; CHECK-LABEL: 'test_ext_02neg' +; CHECK-NEXT: Determining loop execution counts for: @test_ext_02neg +; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + (1 smax (sext i32 %n.div.2 to i64))) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i64 1073741822 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + (1 smax (sext i32 %n.div.2 to i64))) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; entry: %cmp1 = icmp sge i32 %n, 1 %n.div.2 = sdiv i32 %n, 2 @@ -248,8 +310,13 @@ exit: define void @test_ext_03(i32 %n) nounwind { ; Prove that (n > -2) ===> (n / 2 >= 0). -; CHECK: Determining loop execution counts for: @test_ext_03 -; CHECK: Loop %header: backedge-taken count is (1 + (sext i32 %n.div.2 to i64)) +; CHECK-LABEL: 'test_ext_03' +; CHECK-NEXT: Determining loop execution counts for: @test_ext_03 +; CHECK-NEXT: Loop %header: backedge-taken count is (1 + (sext i32 %n.div.2 to i64)) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i64 1073741824 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (1 + (sext i32 %n.div.2 to i64)) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; entry: %cmp1 = icmp sgt i32 %n, -2 %n.div.2 = sdiv i32 %n, 2 @@ -269,8 +336,13 @@ exit: define void @test_ext_03neg(i32 %n) nounwind { ; Prove that (n > -3) =\=> (n / 2 >= 0). -; CHECK: Determining loop execution counts for: @test_ext_03neg -; CHECK: Loop %header: backedge-taken count is (0 smax (1 + (sext i32 %n.div.2 to i64))) +; CHECK-LABEL: 'test_ext_03neg' +; CHECK-NEXT: Determining loop execution counts for: @test_ext_03neg +; CHECK-NEXT: Loop %header: backedge-taken count is (0 smax (1 + (sext i32 %n.div.2 to i64))) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i64 1073741824 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (0 smax (1 + (sext i32 %n.div.2 to i64))) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; entry: %cmp1 = icmp sgt i32 %n, -3 %n.div.2 = sdiv i32 %n, 2 @@ -290,8 +362,13 @@ exit: define void @test_ext_04(i32 %n) nounwind { ; Prove that (n >= -1) ===> (n / 2 >= 0). 
-; CHECK: Determining loop execution counts for: @test_ext_04
-; CHECK: Loop %header: backedge-taken count is (1 + (sext i32 %n.div.2 to i64))
+; CHECK-LABEL: 'test_ext_04'
+; CHECK-NEXT: Determining loop execution counts for: @test_ext_04
+; CHECK-NEXT: Loop %header: backedge-taken count is (1 + (sext i32 %n.div.2 to i64))
+; CHECK-NEXT: Loop %header: constant max backedge-taken count is i64 1073741824
+; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (1 + (sext i32 %n.div.2 to i64))
+; CHECK-NEXT: Loop %header: Trip multiple is 1
+;
 entry:
 %cmp1 = icmp sge i32 %n, -1
 %n.div.2 = sdiv i32 %n, 2
@@ -311,8 +388,13 @@ exit:

 define void @test_ext_04neg(i32 %n) nounwind {
 ; Prove that (n >= -2) =\=> (n / 2 >= 0).
-; CHECK: Determining loop execution counts for: @test_ext_04neg
-; CHECK: Loop %header: backedge-taken count is (0 smax (1 + (sext i32 %n.div.2 to i64)))
+; CHECK-LABEL: 'test_ext_04neg'
+; CHECK-NEXT: Determining loop execution counts for: @test_ext_04neg
+; CHECK-NEXT: Loop %header: backedge-taken count is (0 smax (1 + (sext i32 %n.div.2 to i64)))
+; CHECK-NEXT: Loop %header: constant max backedge-taken count is i64 1073741824
+; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (0 smax (1 + (sext i32 %n.div.2 to i64)))
+; CHECK-NEXT: Loop %header: Trip multiple is 1
+;
 entry:
 %cmp1 = icmp sge i32 %n, -2
 %n.div.2 = sdiv i32 %n, 2
diff --git a/llvm/test/Analysis/ScalarEvolution/infer-prestart-no-wrap.ll b/llvm/test/Analysis/ScalarEvolution/infer-prestart-no-wrap.ll
index a8b891b5afb23..677463ee63225 100644
--- a/llvm/test/Analysis/ScalarEvolution/infer-prestart-no-wrap.ll
+++ b/llvm/test/Analysis/ScalarEvolution/infer-prestart-no-wrap.ll
@@ -1,7 +1,29 @@
-; ; RUN: opt -disable-output "-passes=print<scalar-evolution>" < %s 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -disable-output -passes="print<scalar-evolution>" < %s 2>&1 | FileCheck %s

 define void @infer.sext.0(ptr %c, i32 %start, ptr %buf) {
-; CHECK-LABEL: Classifying expressions for: @infer.sext.0
+; CHECK-LABEL: 'infer.sext.0'
+; CHECK-NEXT: Classifying expressions for: @infer.sext.0
+; CHECK-NEXT: %counter = phi i32 [ 0, %entry ], [ %counter.inc, %loop ]
+; CHECK-NEXT: --> {0,+,1}<%loop> U: [0,2) S: [0,2) Exits: 1 LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %idx = phi i32 [ %start, %entry ], [ %idx.inc, %loop ]
+; CHECK-NEXT: --> {%start,+,1}<%loop> U: full-set S: full-set Exits: (1 + %start) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %idx.inc = add nsw i32 %idx, 1
+; CHECK-NEXT: --> {(1 + %start),+,1}<%loop> U: full-set S: full-set Exits: (2 + %start) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %idx.inc.sext = sext i32 %idx.inc to i64
+; CHECK-NEXT: --> {(1 + (sext i32 %start to i64)),+,1}<%loop> U: [-2147483647,2147483650) S: [-2147483647,2147483650) Exits: (2 + (sext i32 %start to i64)) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %buf.gep = getelementptr inbounds i32, ptr %buf, i32 %idx.inc
+; CHECK-NEXT: --> {(4 + (4 * (sext i32 %start to i64)) + %buf),+,4}<%loop> U: full-set S: full-set Exits: (8 + (4 * (sext i32 %start to i64)) + %buf) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %val = load i32, ptr %buf.gep, align 4
+; CHECK-NEXT: --> %val U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Variant }
+; CHECK-NEXT: %counter.inc = add i32 %counter, 1
+; CHECK-NEXT: --> {1,+,1}<%loop> U: [1,3) S: [1,3) Exits: 2 LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:
Determining loop execution counts for: @infer.sext.0
+; CHECK-NEXT: Loop %loop: backedge-taken count is i32 1
+; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i32 1
+; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is i32 1
+; CHECK-NEXT: Loop %loop: Trip multiple is 2
+;
 entry:
 br label %loop
@@ -10,8 +32,6 @@ define void @infer.sext.0(ptr %c, i32 %start, ptr %buf) {
 %idx = phi i32 [ %start, %entry ], [ %idx.inc, %loop ]
 %idx.inc = add nsw i32 %idx, 1
 %idx.inc.sext = sext i32 %idx.inc to i64
-; CHECK: %idx.inc.sext = sext i32 %idx.inc to i64
-; CHECK-NEXT: --> {(1 + (sext i32 %start to i64)),+,1}<%loop>
 %buf.gep = getelementptr inbounds i32, ptr %buf, i32 %idx.inc
 %val = load i32, ptr %buf.gep
@@ -25,7 +45,28 @@ define void @infer.sext.0(ptr %c, i32 %start, ptr %buf) {
 }

 define void @infer.zext.0(ptr %c, i32 %start, ptr %buf) {
-; CHECK-LABEL: Classifying expressions for: @infer.zext.0
+; CHECK-LABEL: 'infer.zext.0'
+; CHECK-NEXT: Classifying expressions for: @infer.zext.0
+; CHECK-NEXT: %counter = phi i32 [ 0, %entry ], [ %counter.inc, %loop ]
+; CHECK-NEXT: --> {0,+,1}<%loop> U: [0,2) S: [0,2) Exits: 1 LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %idx = phi i32 [ %start, %entry ], [ %idx.inc, %loop ]
+; CHECK-NEXT: --> {%start,+,1}<%loop> U: full-set S: full-set Exits: (1 + %start) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %idx.inc = add nuw i32 %idx, 1
+; CHECK-NEXT: --> {(1 + %start),+,1}<%loop> U: [1,0) S: [1,0) Exits: (2 + %start) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %idx.inc.sext = zext i32 %idx.inc to i64
+; CHECK-NEXT: --> {(1 + (zext i32 %start to i64)),+,1}<%loop> U: [1,4294967298) S: [1,4294967298) Exits: (2 + (zext i32 %start to i64)) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %buf.gep = getelementptr inbounds i32, ptr %buf, i32 %idx.inc
+; CHECK-NEXT: --> ((4 * (sext i32 {(1 + %start),+,1}<%loop> to i64)) + %buf) U: full-set S: full-set Exits: ((4 * (sext i32 (2 + %start) to i64)) + %buf) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %val = load i32, ptr %buf.gep, align 4
+; CHECK-NEXT: --> %val U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Variant }
+; CHECK-NEXT: %counter.inc = add i32 %counter, 1
+; CHECK-NEXT: --> {1,+,1}<%loop> U: [1,3) S: [1,3) Exits: 2 LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: Determining loop execution counts for: @infer.zext.0
+; CHECK-NEXT: Loop %loop: backedge-taken count is i32 1
+; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i32 1
+; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is i32 1
+; CHECK-NEXT: Loop %loop: Trip multiple is 2
+;
 entry:
 br label %loop

@@ -34,8 +75,6 @@ define void @infer.zext.0(ptr %c, i32 %start, ptr %buf) {
 %idx = phi i32 [ %start, %entry ], [ %idx.inc, %loop ]
 %idx.inc = add nuw i32 %idx, 1
 %idx.inc.sext = zext i32 %idx.inc to i64
-; CHECK: %idx.inc.sext = zext i32 %idx.inc to i64
-; CHECK-NEXT: --> {(1 + (zext i32 %start to i64)),+,1}<%loop>
 %buf.gep = getelementptr inbounds i32, ptr %buf, i32 %idx.inc
 %val = load i32, ptr %buf.gep
@@ -49,7 +88,25 @@ define void @infer.zext.0(ptr %c, i32 %start, ptr %buf) {
 }

 define void @infer.sext.1(i32 %start, ptr %c) {
-; CHECK-LABEL: Classifying expressions for: @infer.sext.1
+; CHECK-LABEL: 'infer.sext.1'
+; CHECK-NEXT: Classifying expressions for: @infer.sext.1
+; CHECK-NEXT: %start.mul = mul i32 %start, 4
+; CHECK-NEXT: --> (4 * %start) U: [0,-3) S: [-2147483648,2147483645)
+; CHECK-NEXT: %start.real = add i32 %start.mul, 2
+;
CHECK-NEXT: --> (2 + (4 * %start)) U: [2,-1) S: [-2147483646,2147483647)
+; CHECK-NEXT: %idx = phi i32 [ %start.real, %entry ], [ %idx.inc, %loop ]
+; CHECK-NEXT: --> {(2 + (4 * %start)),+,2}<%loop> U: [0,-1) S: [-2147483646,2147483647) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %idx.sext = sext i32 %idx to i64
+; CHECK-NEXT: --> {(2 + (sext i32 (4 * %start) to i64)),+,2}<%loop> U: [0,-1) S: [-2147483646,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %idx.inc = add nsw i32 %idx, 2
+; CHECK-NEXT: --> {(4 + (4 * %start)),+,2}<%loop> U: [0,-1) S: [-2147483648,2147483647) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %condition = load i1, ptr %c, align 1
+; CHECK-NEXT: --> %condition U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Variant }
+; CHECK-NEXT: Determining loop execution counts for: @infer.sext.1
+; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count.
+; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count.
+;
 entry:
 %start.mul = mul i32 %start, 4
 %start.real = add i32 %start.mul, 2
@@ -58,8 +115,6 @@ define void @infer.sext.1(i32 %start, ptr %c) {
 loop:
 %idx = phi i32 [ %start.real, %entry ], [ %idx.inc, %loop ]
 %idx.sext = sext i32 %idx to i64
-; CHECK: %idx.sext = sext i32 %idx to i64
-; CHECK-NEXT: --> {(2 + (sext i32 (4 * %start) to i64)),+,2}<%loop>
 %idx.inc = add nsw i32 %idx, 2
 %condition = load i1, ptr %c
 br i1 %condition, label %exit, label %loop
@@ -69,7 +124,23 @@
 }

 define void @infer.sext.2(ptr %c, i8 %start) {
-; CHECK-LABEL: Classifying expressions for: @infer.sext.2
+; CHECK-LABEL: 'infer.sext.2'
+; CHECK-NEXT: Classifying expressions for: @infer.sext.2
+; CHECK-NEXT: %start.inc = add i8 %start, 1
+; CHECK-NEXT: --> (1 + %start) U: full-set S: full-set
+; CHECK-NEXT: %idx = phi i8 [ %start.inc, %entry ], [ %idx.inc, %loop ]
+; CHECK-NEXT: --> {(1 + %start),+,1}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %idx.sext = sext i8 %idx to i16
+; CHECK-NEXT: --> {(1 + (sext i8 %start to i16)),+,1}<%loop> U: [-127,-32768) S: [-127,-32768) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %idx.inc = add nsw i8 %idx, 1
+; CHECK-NEXT: --> {(2 + %start),+,1}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %condition = load volatile i1, ptr %c, align 1
+; CHECK-NEXT: --> %condition U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Variant }
+; CHECK-NEXT: Determining loop execution counts for: @infer.sext.2
+; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count.
+; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count.
+;
 entry:
 %start.inc = add i8 %start, 1
 %entry.condition = icmp slt i8 %start, 127
@@ -78,8 +149,6 @@ define void @infer.sext.2(ptr %c, i8 %start) {
 loop:
 %idx = phi i8 [ %start.inc, %entry ], [ %idx.inc, %loop ]
 %idx.sext = sext i8 %idx to i16
-; CHECK: %idx.sext = sext i8 %idx to i16
-; CHECK-NEXT: --> {(1 + (sext i8 %start to i16)),+,1}<%loop>
 %idx.inc = add nsw i8 %idx, 1
 %condition = load volatile i1, ptr %c
 br i1 %condition, label %exit, label %loop
@@ -89,7 +158,23 @@
 }

 define void @infer.zext.1(ptr %c, i8 %start) {
-; CHECK-LABEL: Classifying expressions for: @infer.zext.1
+; CHECK-LABEL: 'infer.zext.1'
+; CHECK-NEXT: Classifying expressions for: @infer.zext.1
+; CHECK-NEXT: %start.inc = add i8 %start, 1
+; CHECK-NEXT: --> (1 + %start) U: full-set S: full-set
+; CHECK-NEXT: %idx = phi i8 [ %start.inc, %entry ], [ %idx.inc, %loop ]
+; CHECK-NEXT: --> {(1 + %start),+,1}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %idx.zext = zext i8 %idx to i16
+; CHECK-NEXT: --> {(1 + (zext i8 %start to i16)),+,1}<%loop> U: [1,0) S: [1,0) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %idx.inc = add nuw i8 %idx, 1
+; CHECK-NEXT: --> {(2 + %start),+,1}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %condition = load volatile i1, ptr %c, align 1
+; CHECK-NEXT: --> %condition U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Variant }
+; CHECK-NEXT: Determining loop execution counts for: @infer.zext.1
+; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count.
+; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count.
+;
 entry:
 %start.inc = add i8 %start, 1
 %entry.condition = icmp ult i8 %start, 255
@@ -98,8 +183,6 @@ define void @infer.zext.1(ptr %c, i8 %start) {
 loop:
 %idx = phi i8 [ %start.inc, %entry ], [ %idx.inc, %loop ]
 %idx.zext = zext i8 %idx to i16
-; CHECK: %idx.zext = zext i8 %idx to i16
-; CHECK-NEXT: --> {(1 + (zext i8 %start to i16)),+,1}<%loop>
 %idx.inc = add nuw i8 %idx, 1
 %condition = load volatile i1, ptr %c
 br i1 %condition, label %exit, label %loop
diff --git a/llvm/test/Transforms/IndVarSimplify/implied-via-addition.ll b/llvm/test/Transforms/IndVarSimplify/implied-via-addition.ll
new file mode 100644
index 0000000000000..865c10e3913aa
--- /dev/null
+++ b/llvm/test/Transforms/IndVarSimplify/implied-via-addition.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=indvars -S < %s | FileCheck %s
+
+declare void @use(i1)
+
+declare void @llvm.experimental.guard(i1, ...)
+
+define void @test_01(i8 %t) {
+; CHECK-LABEL: define void @test_01(
+; CHECK-SAME: i8 [[T:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[ST:%.*]] = sext i8 [[T]] to i16
+; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i16 [[ST]], 42
+; CHECK-NEXT: call void (i1, ...)
@llvm.experimental.guard(i1 [[CMP1]]) [ "deopt"() ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IDX:%.*]] = phi i8 [ [[T]], %[[ENTRY]] ], [ [[IDX_INC:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IDX_INC]] = add nsw i8 [[IDX]], 1
+; CHECK-NEXT: call void @use(i1 true)
+; CHECK-NEXT: [[BE:%.*]] = icmp slt i8 [[IDX_INC]], 42
+; CHECK-NEXT: br i1 [[BE]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+ entry:
+ %st = sext i8 %t to i16
+ %cmp1 = icmp slt i16 %st, 42
+ call void(i1, ...) @llvm.experimental.guard(i1 %cmp1) [ "deopt"() ]
+ br label %loop
+
+ loop:
+ %idx = phi i8 [ %t, %entry ], [ %idx.inc, %loop ]
+ %idx.inc = add i8 %idx, 1
+ %c = icmp slt i8 %idx, 42
+ call void @use(i1 %c)
+ %be = icmp slt i8 %idx.inc, 42
+ br i1 %be, label %loop, label %exit
+
+ exit:
+ ret void
+}
+
+define void @test_02(i8 %t) {
+; CHECK-LABEL: define void @test_02(
+; CHECK-SAME: i8 [[T:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[T_PTR:%.*]] = inttoptr i8 [[T]] to ptr
+; CHECK-NEXT: [[P_42:%.*]] = inttoptr i8 42 to ptr
+; CHECK-NEXT: [[CMP1:%.*]] = icmp slt ptr [[T_PTR]], [[P_42]]
+; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 [[CMP1]]) [ "deopt"() ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IDX:%.*]] = phi ptr [ [[T_PTR]], %[[ENTRY]] ], [ [[SNEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[SNEXT]] = getelementptr inbounds i8, ptr [[IDX]], i64 1
+; CHECK-NEXT: call void @use(i1 true)
+; CHECK-NEXT: [[BE:%.*]] = icmp ult ptr [[SNEXT]], [[P_42]]
+; CHECK-NEXT: br i1 [[BE]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+ entry:
+ %t.ptr = inttoptr i8 %t to ptr
+ %p.42 = inttoptr i8 42 to ptr
+ %cmp1 = icmp slt ptr %t.ptr, %p.42
+ call void(i1, ...) @llvm.experimental.guard(i1 %cmp1) [ "deopt"() ]
+ br label %loop
+
+ loop:
+ %idx = phi ptr [ %t.ptr, %entry ], [ %snext, %loop ]
+ %snext = getelementptr inbounds i8, ptr %idx, i64 1
+ %c = icmp slt ptr %idx, %p.42
+ call void @use(i1 %c)
+ %be = icmp slt ptr %snext, %p.42
+ br i1 %be, label %loop, label %exit
+
+ exit:
+ ret void
+}

From f22af59336d45d2a000f1033be0203340bf8ad36 Mon Sep 17 00:00:00 2001
From: Jacek Caban
Date: Wed, 15 Jan 2025 15:21:06 +0100
Subject: [PATCH 69/82] [LLD][COFF] Move symbol mangling and lookup helpers to SymbolTable class (NFC) (#122836)

This refactor prepares for further ARM64X hybrid support, where these
helpers will need to work with either the native or EC symbol table
based on context.
---
 lld/COFF/Driver.cpp      | 162 ++++++++------------------------
 lld/COFF/Driver.h        |  19 -----
 lld/COFF/SymbolTable.cpp | 106 +++++++++++++++++++++++
 lld/COFF/SymbolTable.h   |  17 ++++
 4 files changed, 153 insertions(+), 151 deletions(-)

diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp
index 0d89457046a50..6af6b4f730766 100644
--- a/lld/COFF/Driver.cpp
+++ b/lld/COFF/Driver.cpp
@@ -37,7 +37,6 @@
 #include "llvm/Support/BinaryStreamReader.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/GlobPattern.h"
 #include "llvm/Support/LEB128.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/Parallel.h"
@@ -172,23 +171,10 @@ static std::future<MBErrPair> createFutureForFile(std::string path) {
 });
 }

-// Symbol names are mangled by prepending "_" on x86.
-StringRef LinkerDriver::mangle(StringRef sym) {
- assert(ctx.config.machine != IMAGE_FILE_MACHINE_UNKNOWN);
- if (ctx.config.machine == I386)
- return saver().save("_" + sym);
- return sym;
-}
-
 llvm::Triple::ArchType LinkerDriver::getArch() {
 return getMachineArchType(ctx.config.machine);
 }

-bool LinkerDriver::findUnderscoreMangle(StringRef sym) {
- Symbol *s = ctx.symtab.findMangle(mangle(sym));
- return s && !isa<Undefined>(s);
-}
-
 static bool compatibleMachineType(COFFLinkerContext &ctx, MachineTypes mt) {
 if (mt == IMAGE_FILE_MACHINE_UNKNOWN)
 return true;
@@ -486,7 +472,7 @@ void LinkerDriver::parseDirectives(InputFile *file) {
 SmallVector<StringRef, 2> vec;
 e.split(vec, ',');
 for (StringRef sym : vec)
- excludedSymbols.insert(mangle(sym));
+ excludedSymbols.insert(file->symtab.mangle(sym));
 }

 // https://docs.microsoft.com/en-us/cpp/preprocessor/comment-c-cpp?view=msvc-160
@@ -505,7 +491,8 @@ void LinkerDriver::parseDirectives(InputFile *file) {
 case OPT_entry:
 if (!arg->getValue()[0])
 Fatal(ctx) << "missing entry point symbol name";
- ctx.config.entry = file->symtab.addGCRoot(mangle(arg->getValue()), true);
+ ctx.config.entry =
+ file->symtab.addGCRoot(file->symtab.mangle(arg->getValue()), true);
 break;
 case OPT_failifmismatch:
 checkFailIfMismatch(arg->getValue(), file);
@@ -805,97 +792,6 @@ void LinkerDriver::addLibSearchPaths() {
 }
 }

-void LinkerDriver::addUndefinedGlob(StringRef arg) {
- Expected<GlobPattern> pat = GlobPattern::create(arg);
- if (!pat) {
- Err(ctx) << "/includeglob: " << toString(pat.takeError());
- return;
- }
-
- SmallVector<Symbol *, 0> syms;
- ctx.symtab.forEachSymbol([&syms, &pat](Symbol *sym) {
- if (pat->match(sym->getName())) {
- syms.push_back(sym);
- }
- });
-
- for (Symbol *sym : syms)
- ctx.symtab.addGCRoot(sym->getName());
-}
-
-StringRef LinkerDriver::mangleMaybe(Symbol *s) {
- // If the plain symbol name has already been resolved, do nothing.
- Undefined *unmangled = dyn_cast<Undefined>(s);
- if (!unmangled)
- return "";
-
- // Otherwise, see if a similar, mangled symbol exists in the symbol table.
- Symbol *mangled = ctx.symtab.findMangle(unmangled->getName());
- if (!mangled)
- return "";
-
- // If we find a similar mangled symbol, make this an alias to it and return
- // its name.
- Log(ctx) << unmangled->getName() << " aliased to " << mangled->getName();
- unmangled->setWeakAlias(ctx.symtab.addUndefined(mangled->getName()));
- return mangled->getName();
-}
-
-// Windows specific -- find default entry point name.
-//
-// There are four different entry point functions for Windows executables,
-// each of which corresponds to a user-defined "main" function. This function
-// infers an entry point from a user-defined "main" function.
-StringRef LinkerDriver::findDefaultEntry() {
- assert(ctx.config.subsystem != IMAGE_SUBSYSTEM_UNKNOWN &&
- "must handle /subsystem before calling this");
-
- if (ctx.config.mingw)
- return mangle(ctx.config.subsystem == IMAGE_SUBSYSTEM_WINDOWS_GUI
- ? "WinMainCRTStartup"
- : "mainCRTStartup");
-
- if (ctx.config.subsystem == IMAGE_SUBSYSTEM_WINDOWS_GUI) {
- if (findUnderscoreMangle("wWinMain")) {
- if (!findUnderscoreMangle("WinMain"))
- return mangle("wWinMainCRTStartup");
- Warn(ctx) << "found both wWinMain and WinMain; using latter";
- }
- return mangle("WinMainCRTStartup");
- }
- if (findUnderscoreMangle("wmain")) {
- if (!findUnderscoreMangle("main"))
- return mangle("wmainCRTStartup");
- Warn(ctx) << "found both wmain and main; using latter";
- }
- return mangle("mainCRTStartup");
-}
-
-WindowsSubsystem LinkerDriver::inferSubsystem() {
- if (ctx.config.dll)
- return IMAGE_SUBSYSTEM_WINDOWS_GUI;
- if (ctx.config.mingw)
- return IMAGE_SUBSYSTEM_WINDOWS_CUI;
- // Note that link.exe infers the subsystem from the presence of these
- // functions even if /entry: or /nodefaultlib are passed which causes them
- // to not be called.
- bool haveMain = findUnderscoreMangle("main");
- bool haveWMain = findUnderscoreMangle("wmain");
- bool haveWinMain = findUnderscoreMangle("WinMain");
- bool haveWWinMain = findUnderscoreMangle("wWinMain");
- if (haveMain || haveWMain) {
- if (haveWinMain || haveWWinMain) {
- Warn(ctx) << "found " << (haveMain ? "main" : "wmain") << " and "
- << (haveWinMain ? "WinMain" : "wWinMain")
- << "; defaulting to /subsystem:console";
- }
- return IMAGE_SUBSYSTEM_WINDOWS_CUI;
- }
- if (haveWinMain || haveWWinMain)
- return IMAGE_SUBSYSTEM_WINDOWS_GUI;
- return IMAGE_SUBSYSTEM_UNKNOWN;
-}
-
 uint64_t LinkerDriver::getDefaultImageBase() {
 if (ctx.config.is64())
 return ctx.config.dll ? 0x180000000 : 0x140000000;
@@ -1539,7 +1435,7 @@ void LinkerDriver::maybeExportMinGWSymbols(const opt::InputArgList &args) {
 SmallVector<StringRef, 2> vec;
 StringRef(arg->getValue()).split(vec, ',');
 for (StringRef sym : vec)
- exporter.addExcludedSymbol(mangle(sym));
+ exporter.addExcludedSymbol(ctx.symtab.mangle(sym));
 }

 ctx.symtab.forEachSymbol([&](Symbol *s) {
@@ -2455,7 +2351,7 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
 // and after the early return when just writing an import library.
 if (config->subsystem == IMAGE_SUBSYSTEM_UNKNOWN) {
 llvm::TimeTraceScope timeScope("Infer subsystem");
- config->subsystem = inferSubsystem();
+ config->subsystem = ctx.symtab.inferSubsystem();
 if (config->subsystem == IMAGE_SUBSYSTEM_UNKNOWN)
 Fatal(ctx) << "subsystem must be defined";
 }
@@ -2466,7 +2362,8 @@
 if (auto *arg = args.getLastArg(OPT_entry)) {
 if (!arg->getValue()[0])
 Fatal(ctx) << "missing entry point symbol name";
- config->entry = ctx.symtab.addGCRoot(mangle(arg->getValue()), true);
+ config->entry =
+ ctx.symtab.addGCRoot(ctx.symtab.mangle(arg->getValue()), true);
 } else if (!config->entry && !config->noEntry) {
 if (args.hasArg(OPT_dll)) {
 StringRef s = (config->machine == I386) ? "__DllMainCRTStartup@12"
 : "_DllMainCRTStartup";
 config->entry = ctx.symtab.addGCRoot(s, true);
 } else if (config->driverWdm) {
 // /driver:wdm implies /entry:_NtProcessStartup
- config->entry = ctx.symtab.addGCRoot(mangle("_NtProcessStartup"), true);
+ config->entry =
+ ctx.symtab.addGCRoot(ctx.symtab.mangle("_NtProcessStartup"), true);
 } else {
 // Windows specific -- If entry point name is not given, we need to
 // infer that from user-defined entry name.
- StringRef s = findDefaultEntry(); + StringRef s = ctx.symtab.findDefaultEntry(); if (s.empty()) Fatal(ctx) << "entry point must be defined"; config->entry = ctx.symtab.addGCRoot(s, true); @@ -2568,24 +2466,24 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { config->imageBase = getDefaultImageBase(); ctx.forEachSymtab([&](SymbolTable &symtab) { - symtab.addSynthetic(mangle("__ImageBase"), nullptr); + symtab.addSynthetic(symtab.mangle("__ImageBase"), nullptr); if (symtab.machine == I386) { symtab.addAbsolute("___safe_se_handler_table", 0); symtab.addAbsolute("___safe_se_handler_count", 0); } - symtab.addAbsolute(mangle("__guard_fids_count"), 0); - symtab.addAbsolute(mangle("__guard_fids_table"), 0); - symtab.addAbsolute(mangle("__guard_flags"), 0); - symtab.addAbsolute(mangle("__guard_iat_count"), 0); - symtab.addAbsolute(mangle("__guard_iat_table"), 0); - symtab.addAbsolute(mangle("__guard_longjmp_count"), 0); - symtab.addAbsolute(mangle("__guard_longjmp_table"), 0); + symtab.addAbsolute(symtab.mangle("__guard_fids_count"), 0); + symtab.addAbsolute(symtab.mangle("__guard_fids_table"), 0); + symtab.addAbsolute(symtab.mangle("__guard_flags"), 0); + symtab.addAbsolute(symtab.mangle("__guard_iat_count"), 0); + symtab.addAbsolute(symtab.mangle("__guard_iat_table"), 0); + symtab.addAbsolute(symtab.mangle("__guard_longjmp_count"), 0); + symtab.addAbsolute(symtab.mangle("__guard_longjmp_table"), 0); // Needed for MSVC 2017 15.5 CRT. - symtab.addAbsolute(mangle("__enclave_config"), 0); + symtab.addAbsolute(symtab.mangle("__enclave_config"), 0); // Needed for MSVC 2019 16.8 CRT. - symtab.addAbsolute(mangle("__guard_eh_cont_count"), 0); - symtab.addAbsolute(mangle("__guard_eh_cont_table"), 0); + symtab.addAbsolute(symtab.mangle("__guard_eh_cont_count"), 0); + symtab.addAbsolute(symtab.mangle("__guard_eh_cont_table"), 0); if (symtab.isEC()) { symtab.addAbsolute("__arm64x_extra_rfe_table", 0); @@ -2606,16 +2504,16 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { } if (config->pseudoRelocs) { - symtab.addAbsolute(mangle("__RUNTIME_PSEUDO_RELOC_LIST__"), 0); - symtab.addAbsolute(mangle("__RUNTIME_PSEUDO_RELOC_LIST_END__"), 0); + symtab.addAbsolute(symtab.mangle("__RUNTIME_PSEUDO_RELOC_LIST__"), 0); + symtab.addAbsolute(symtab.mangle("__RUNTIME_PSEUDO_RELOC_LIST_END__"), 0); } if (config->mingw) { - symtab.addAbsolute(mangle("__CTOR_LIST__"), 0); - symtab.addAbsolute(mangle("__DTOR_LIST__"), 0); + symtab.addAbsolute(symtab.mangle("__CTOR_LIST__"), 0); + symtab.addAbsolute(symtab.mangle("__DTOR_LIST__"), 0); } if (config->debug || config->buildIDHash != BuildIDHash::None) if (symtab.findUnderscore("__buildid")) - symtab.addUndefined(mangle("__buildid")); + symtab.addUndefined(symtab.mangle("__buildid")); }); // This code may add new undefined symbols to the link, which may enqueue more @@ -2627,7 +2525,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { // Windows specific -- if entry point is not found, // search for its mangled names. if (config->entry) - mangleMaybe(config->entry); + ctx.symtab.mangleMaybe(config->entry); // Windows specific -- Make sure we resolve all dllexported symbols. for (Export &e : config->exports) { @@ -2635,7 +2533,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { continue; e.sym = ctx.symtab.addGCRoot(e.name, !e.data); if (e.source != ExportSource::Directives) - e.symbolName = mangleMaybe(e.sym); + e.symbolName = ctx.symtab.mangleMaybe(e.sym); } // Add weak aliases. 
Weak aliases is a mechanism to give remaining
@@ -2675,7 +2573,7 @@
 // Windows specific -- if __load_config_used can be resolved, resolve it.
 if (ctx.symtab.findUnderscore("_load_config_used"))
- ctx.symtab.addGCRoot(mangle("_load_config_used"));
+ ctx.symtab.addGCRoot(ctx.symtab.mangle("_load_config_used"));

 if (args.hasArg(OPT_include_optional)) {
 // Handle /includeoptional
@@ -2688,7 +2586,7 @@

 // Handle /includeglob
 for (StringRef pat : args::getStrings(args, OPT_incl_glob))
- addUndefinedGlob(pat);
+ ctx.symtab.addUndefinedGlob(pat);

 // Create wrapped symbols for -wrap option.
 std::vector<Symbol *> wrapped = addWrappedSymbols(ctx, args);
diff --git a/lld/COFF/Driver.h b/lld/COFF/Driver.h
index 9d4f1cbfcb584..4558f68c041fa 100644
--- a/lld/COFF/Driver.h
+++ b/lld/COFF/Driver.h
@@ -106,8 +106,6 @@ class LinkerDriver {
 StringRef findLib(StringRef filename);
 StringRef findLibMinGW(StringRef filename);

- bool findUnderscoreMangle(StringRef sym);
-
 // Determines the location of the sysroot based on `args`, environment, etc.
 void detectWinSysRoot(const llvm::opt::InputArgList &args);

@@ -115,9 +113,6 @@ class LinkerDriver {
 // config.machine has been set.
 void addWinSysRootLibSearchPaths();

- // Symbol names are mangled by prepending "_" on x86.
- StringRef mangle(StringRef sym);
-
 void setMachine(llvm::COFF::MachineTypes machine);
 llvm::Triple::ArchType getArch();

@@ -173,20 +168,6 @@ class LinkerDriver {
 std::set<std::string> visitedLibs;

- void addUndefinedGlob(StringRef arg);
-
- StringRef mangleMaybe(Symbol *s);
-
- // Windows specific -- "main" is not the only main function in Windows.
- // You can choose one from these four -- {w,}{WinMain,main}.
- // There are four different entry point functions for them,
- // {w,}{WinMain,main}CRTStartup, respectively. The linker needs to
- // choose the right one depending on which "main" function is defined.
- // This function looks up the symbol table and resolve corresponding
- // entry point name.
- StringRef findDefaultEntry();
- WindowsSubsystem inferSubsystem();
-
 void addBuffer(std::unique_ptr<MemoryBuffer> mb, bool wholeArchive,
 bool lazy);
 void addArchiveBuffer(MemoryBufferRef mbref, StringRef symName,
diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp
index b2f3ffe780e5d..7c43ada3d136e 100644
--- a/lld/COFF/SymbolTable.cpp
+++ b/lld/COFF/SymbolTable.cpp
@@ -21,12 +21,14 @@
 #include "llvm/IR/Mangler.h"
 #include "llvm/LTO/LTO.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/GlobPattern.h"
 #include "llvm/Support/Parallel.h"
 #include "llvm/Support/TimeProfiler.h"
 #include "llvm/Support/raw_ostream.h"
 #include <utility>

 using namespace llvm;
+using namespace llvm::COFF;
 using namespace llvm::support;

 namespace lld::coff {
@@ -1022,6 +1024,110 @@ Symbol *SymbolTable::findMangle(StringRef name) {
 return findByPrefix("?" + name.substr(1) + "@@Y");
 }

+bool SymbolTable::findUnderscoreMangle(StringRef sym) {
+ Symbol *s = findMangle(mangle(sym));
+ return s && !isa<Undefined>(s);
+}
+
+// Symbol names are mangled by prepending "_" on x86.
+StringRef SymbolTable::mangle(StringRef sym) {
+ assert(machine != IMAGE_FILE_MACHINE_UNKNOWN);
+ if (machine == I386)
+ return saver().save("_" + sym);
+ return sym;
+}
+
+StringRef SymbolTable::mangleMaybe(Symbol *s) {
+ // If the plain symbol name has already been resolved, do nothing.
+ Undefined *unmangled = dyn_cast<Undefined>(s);
+ if (!unmangled)
+ return "";
+
+ // Otherwise, see if a similar, mangled symbol exists in the symbol table.
+ Symbol *mangled = findMangle(unmangled->getName());
+ if (!mangled)
+ return "";
+
+ // If we find a similar mangled symbol, make this an alias to it and return
+ // its name.
+ Log(ctx) << unmangled->getName() << " aliased to " << mangled->getName();
+ unmangled->setWeakAlias(addUndefined(mangled->getName()));
+ return mangled->getName();
+}
+
+// Windows specific -- find default entry point name.
+//
+// There are four different entry point functions for Windows executables,
+// each of which corresponds to a user-defined "main" function. This function
+// infers an entry point from a user-defined "main" function.
+StringRef SymbolTable::findDefaultEntry() {
+ assert(ctx.config.subsystem != IMAGE_SUBSYSTEM_UNKNOWN &&
+ "must handle /subsystem before calling this");
+
+ if (ctx.config.mingw)
+ return mangle(ctx.config.subsystem == IMAGE_SUBSYSTEM_WINDOWS_GUI
+ ? "WinMainCRTStartup"
+ : "mainCRTStartup");
+
+ if (ctx.config.subsystem == IMAGE_SUBSYSTEM_WINDOWS_GUI) {
+ if (findUnderscoreMangle("wWinMain")) {
+ if (!findUnderscoreMangle("WinMain"))
+ return mangle("wWinMainCRTStartup");
+ Warn(ctx) << "found both wWinMain and WinMain; using latter";
+ }
+ return mangle("WinMainCRTStartup");
+ }
+ if (findUnderscoreMangle("wmain")) {
+ if (!findUnderscoreMangle("main"))
+ return mangle("wmainCRTStartup");
+ Warn(ctx) << "found both wmain and main; using latter";
+ }
+ return mangle("mainCRTStartup");
+}
+
+WindowsSubsystem SymbolTable::inferSubsystem() {
+ if (ctx.config.dll)
+ return IMAGE_SUBSYSTEM_WINDOWS_GUI;
+ if (ctx.config.mingw)
+ return IMAGE_SUBSYSTEM_WINDOWS_CUI;
+ // Note that link.exe infers the subsystem from the presence of these
+ // functions even if /entry: or /nodefaultlib are passed which causes them
+ // to not be called.
+ bool haveMain = findUnderscoreMangle("main");
+ bool haveWMain = findUnderscoreMangle("wmain");
+ bool haveWinMain = findUnderscoreMangle("WinMain");
+ bool haveWWinMain = findUnderscoreMangle("wWinMain");
+ if (haveMain || haveWMain) {
+ if (haveWinMain || haveWWinMain) {
+ Warn(ctx) << "found " << (haveMain ? "main" : "wmain") << " and "
+ << (haveWinMain ? "WinMain" : "wWinMain")
+ << "; defaulting to /subsystem:console";
+ }
+ return IMAGE_SUBSYSTEM_WINDOWS_CUI;
+ }
+ if (haveWinMain || haveWWinMain)
+ return IMAGE_SUBSYSTEM_WINDOWS_GUI;
+ return IMAGE_SUBSYSTEM_UNKNOWN;
+}
+
+void SymbolTable::addUndefinedGlob(StringRef arg) {
+ Expected<GlobPattern> pat = GlobPattern::create(arg);
+ if (!pat) {
+ Err(ctx) << "/includeglob: " << toString(pat.takeError());
+ return;
+ }
+
+ SmallVector<Symbol *, 0> syms;
+ forEachSymbol([&syms, &pat](Symbol *sym) {
+ if (pat->match(sym->getName())) {
+ syms.push_back(sym);
+ }
+ });
+
+ for (Symbol *sym : syms)
+ addGCRoot(sym->getName());
+}
+
 Symbol *SymbolTable::addUndefined(StringRef name) {
 return addUndefined(name, nullptr, false);
 }
diff --git a/lld/COFF/SymbolTable.h b/lld/COFF/SymbolTable.h
index 4c749ae059d27..1de0b3e1deac3 100644
--- a/lld/COFF/SymbolTable.h
+++ b/lld/COFF/SymbolTable.h
@@ -74,11 +74,27 @@ class SymbolTable {
 Symbol *find(StringRef name) const;
 Symbol *findUnderscore(StringRef name) const;

+ void addUndefinedGlob(StringRef arg);
+
 // Occasionally we have to resolve an undefined symbol to its
 // mangled symbol. This function tries to find a mangled name
 // for U from the symbol table, and if found, set the symbol as
 // a weak alias for U.
 Symbol *findMangle(StringRef name);
+ StringRef mangleMaybe(Symbol *s);
+
+ // Symbol names are mangled by prepending "_" on x86.
+ StringRef mangle(StringRef sym);
+
+ // Windows specific -- "main" is not the only main function in Windows.
+ // You can choose one from these four -- {w,}{WinMain,main}.
+ // There are four different entry point functions for them,
+ // {w,}{WinMain,main}CRTStartup, respectively. The linker needs to
+ // choose the right one depending on which "main" function is defined.
+ // This function looks up the symbol table and resolve corresponding
+ // entry point name.
+ StringRef findDefaultEntry();
+ WindowsSubsystem inferSubsystem();

// Build a set of COFF objects representing the combined contents of
// BitcodeFiles and add them to the symbol table. Called after all files are
@@ -152,6 +168,7 @@
/// Same as insert(Name), but also sets isUsedInRegularObj.
std::pair insert(StringRef name, InputFile *f);
+ bool findUnderscoreMangle(StringRef sym);
std::vector getSymsWithPrefix(StringRef prefix);

llvm::DenseMap symMap;

From c82a6a025179e3b155c70f5ad8f84fa8ec2a9452 Mon Sep 17 00:00:00 2001
From: Fraser Cormack
Date: Wed, 15 Jan 2025 14:28:42 +0000
Subject: [PATCH 70/82] [AMDGPU] Use correct vector elt type when shrinking
 mfma scale (#123043)

This might be a copy/paste error. I don't think this is an issue in
practice as the builtins/intrinsics are only legal with identical
vector element types.

---
 llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 8b1b398606583..bac3bb5fde7b0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1346,7 +1346,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {

 if (Src1Ty->getNumElements() > Src1NumElts) {
 Src1 = IC.Builder.CreateExtractVector(
- FixedVectorType::get(Src0Ty->getElementType(), Src1NumElts), Src1,
+ FixedVectorType::get(Src1Ty->getElementType(), Src1NumElts), Src1,
 IC.Builder.getInt64(0));
 MadeChange = true;
 }

From e9504c52edd796a22a879b381f17bd8ed235bfd4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?=
Date: Wed, 15 Jan 2025 14:54:16 +0000
Subject: [PATCH 71/82] [mlir][vector] Add tests for
 populateSinkVectorOpsPatterns (2/N) (#122338)

Adds tests for scalable vectors in:
  * "vector-sink.mlir".

This test file exercises patterns included in
`populateSinkVectorOpsPatterns`:
  * `ReorderElementwiseOpsOnBroadcast`,
  * `ReorderCastOpsOnBroadcast`,
  * `ReorderElementwiseOpsOnTranspose`.

This PR focuses on adding tests for the latter two patterns
(`ReorderCastOpsOnBroadcast` and `ReorderElementwiseOpsOnTranspose`).
Tests for `ReorderElementwiseOpsOnBroadcast` were added in #102286.

Please note that in PR #102856, I renamed:
  * `populateSinkVectorBroadcastPatterns`, to
  * `populateSinkVectorOpsPatterns`.
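For readers landing on the tests first, a minimal sketch of how these
patterns are typically driven (the header paths and the greedy-driver
call are assumptions based on common MLIR pass boilerplate, not part of
this patch):

  #include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
  #include "mlir/Transforms/GreedyPatternRewriteDriver.h"

  // Gather the sink patterns and apply them greedily to `root`; this is
  // the kind of wiring the test pass behind vector-sink.mlir performs.
  static void runVectorSinkPatterns(mlir::Operation *root) {
    mlir::RewritePatternSet patterns(root->getContext());
    mlir::vector::populateSinkVectorOpsPatterns(patterns);
    (void)mlir::applyPatternsAndFoldGreedily(root, std::move(patterns));
  }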
--- mlir/test/Dialect/Vector/vector-sink.mlir | 103 ++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/mlir/test/Dialect/Vector/vector-sink.mlir b/mlir/test/Dialect/Vector/vector-sink.mlir index 5a3699333265c..7ce840575a803 100644 --- a/mlir/test/Dialect/Vector/vector-sink.mlir +++ b/mlir/test/Dialect/Vector/vector-sink.mlir @@ -228,6 +228,16 @@ func.func @broadcast_vector_extsi(%a : vector<4xi8>) -> vector<2x4xi32> { // ----- +func.func @broadcast_vector_extsi_scalable(%a : vector<[4]xi8>) -> vector<2x[4]xi32> { + // CHECK: %[[EXT:.+]] = arith.extsi %{{.+}} : vector<[4]xi8> to vector<[4]xi32> + // CHECK: vector.broadcast %[[EXT:.+]] : vector<[4]xi32> to vector<2x[4]xi32> + %b = vector.broadcast %a : vector<[4]xi8> to vector<2x[4]xi8> + %r = arith.extsi %b : vector<2x[4]xi8> to vector<2x[4]xi32> + return %r : vector<2x[4]xi32> +} + +// ----- + func.func @broadcast_scalar_extsi(%a : i8) -> vector<2x4xi32> { // CHECK: %[[EXT:.+]] = arith.extsi %{{.+}} : i8 to i32 // CHECK: vector.broadcast %[[EXT]] : i32 to vector<2x4xi32> @@ -236,6 +246,16 @@ func.func @broadcast_scalar_extsi(%a : i8) -> vector<2x4xi32> { return %r : vector<2x4xi32> } +// ----- + +func.func @broadcast_scalar_extsi_scalable(%a : i8) -> vector<2x[4]xi32> { + // CHECK: %[[EXT:.+]] = arith.extsi %{{.+}} : i8 to i32 + // CHECK: vector.broadcast %[[EXT]] : i32 to vector<2x[4]xi32> + %b = vector.broadcast %a : i8 to vector<2x[4]xi8> + %r = arith.extsi %b : vector<2x[4]xi8> to vector<2x[4]xi32> + return %r : vector<2x[4]xi32> +} + //===----------------------------------------------------------------------===// // [Pattern: ReorderElementwiseOpsOnTranspose] //===----------------------------------------------------------------------===// @@ -250,6 +270,16 @@ func.func @transpose_extsi(%a : vector<4x2xi8>) -> vector<2x4xi32> { // ----- +func.func @transpose_extsi_scalable(%a : vector<[4]x2xi8>) -> vector<2x[4]xi32> { + // CHECK: %[[EXT:.+]] = arith.extsi %{{.+}} : vector<[4]x2xi8> to vector<[4]x2xi32> + // CHECK: vector.transpose %[[EXT]], [1, 0] : vector<[4]x2xi32> to vector<2x[4]xi32> + %b = vector.transpose %a, [1, 0]: vector<[4]x2xi8> to vector<2x[4]xi8> + %r = arith.extsi %b : vector<2x[4]xi8> to vector<2x[4]xi32> + return %r : vector<2x[4]xi32> +} + +// ----- + // CHECK-LABEL: func @transpose_elementwise_same_type // CHECK-SAME: (%[[A:.+]]: vector<4x2xf32>, %[[B:.+]]: vector<4x2xf32>) // CHECK: %[[ADD:.+]] = arith.addf %[[A]], %[[B]] : vector<4x2xf32> @@ -265,6 +295,21 @@ func.func @transpose_elementwise_same_type(%a : vector<4x2xf32>, %b : vector<4x2 // ----- +// CHECK-LABEL: func @transpose_elementwise_same_type_scalable +// CHECK-SAME: (%[[A:.+]]: vector<[4]x2xf32>, %[[B:.+]]: vector<[4]x2xf32>) +// CHECK: %[[ADD:.+]] = arith.addf %[[A]], %[[B]] : vector<[4]x2xf32> +// CHECK: %[[T:.+]] = vector.transpose %[[ADD]], [1, 0] +// CHECK: return %[[T]] + +func.func @transpose_elementwise_same_type_scalable(%a : vector<[4]x2xf32>, %b : vector<[4]x2xf32>) -> vector<2x[4]xf32> { + %at = vector.transpose %a, [1, 0]: vector<[4]x2xf32> to vector<2x[4]xf32> + %bt = vector.transpose %b, [1, 0]: vector<[4]x2xf32> to vector<2x[4]xf32> + %r = arith.addf %at, %bt : vector<2x[4]xf32> + return %r : vector<2x[4]xf32> +} + +// ----- + // CHECK-LABEL: func @transpose_elementwise_diff_operand_types // CHECK-SAME: (%[[COND:.+]]: vector<4x2xi1>, %[[A:.+]]: vector<4x2xf32>, %[[B:.+]]: vector<4x2xf32>) // CHECK: %[[S:.+]] = arith.select %[[COND]], %[[A]], %[[B]] : vector<4x2xi1>, vector<4x2xf32> @@ -280,6 +325,21 @@ func.func 
@transpose_elementwise_diff_operand_types(%cond: vector<4x2xi1>, %a : // ----- +// CHECK-LABEL: func @transpose_elementwise_diff_operand_types_scalable +// CHECK-SAME: (%[[COND:.+]]: vector<[4]x2xi1>, %[[A:.+]]: vector<[4]x2xf32>, %[[B:.+]]: vector<[4]x2xf32>) +// CHECK: %[[S:.+]] = arith.select %[[COND]], %[[A]], %[[B]] : vector<[4]x2xi1>, vector<[4]x2xf32> +// CHECK: %[[T:.+]] = vector.transpose %[[S]], [1, 0] : vector<[4]x2xf32> to vector<2x[4]xf32> +// CHECK: return %[[T]] +func.func @transpose_elementwise_diff_operand_types_scalable(%cond: vector<[4]x2xi1>, %a : vector<[4]x2xf32>, %b : vector<[4]x2xf32>) -> vector<2x[4]xf32> { + %condt = vector.transpose %cond, [1, 0]: vector<[4]x2xi1> to vector<2x[4]xi1> + %at = vector.transpose %a, [1, 0]: vector<[4]x2xf32> to vector<2x[4]xf32> + %bt = vector.transpose %b, [1, 0]: vector<[4]x2xf32> to vector<2x[4]xf32> + %r = arith.select %condt, %at, %bt : vector<2x[4]xi1>, vector<2x[4]xf32> + return %r : vector<2x[4]xf32> +} + +// ----- + // CHECK-LABEL: func @transpose_elementwise_diff_operand_result_type // CHECK-SAME: (%[[A:.+]]: vector<4x2xf32>, %[[B:.+]]: vector<4x2xf32>) // CHECK: %[[CMP:.+]] = arith.cmpf olt, %[[A]], %[[B]] : vector<4x2xf32> @@ -294,6 +354,20 @@ func.func @transpose_elementwise_diff_operand_result_type(%a : vector<4x2xf32>, // ----- +// CHECK-LABEL: func @transpose_elementwise_diff_operand_result_type_scalable +// CHECK-SAME: (%[[A:.+]]: vector<[4]x2xf32>, %[[B:.+]]: vector<[4]x2xf32>) +// CHECK: %[[CMP:.+]] = arith.cmpf olt, %[[A]], %[[B]] : vector<[4]x2xf32> +// CHECK: %[[T:.+]] = vector.transpose %[[CMP]], [1, 0] : vector<[4]x2xi1> to vector<2x[4]xi1> +// CHECK: return %[[T]] +func.func @transpose_elementwise_diff_operand_result_type_scalable(%a : vector<[4]x2xf32>, %b : vector<[4]x2xf32>) -> vector<2x[4]xi1> { + %at = vector.transpose %a, [1, 0]: vector<[4]x2xf32> to vector<2x[4]xf32> + %bt = vector.transpose %b, [1, 0]: vector<[4]x2xf32> to vector<2x[4]xf32> + %r = arith.cmpf olt, %at, %bt : vector<2x[4]xf32> + return %r : vector<2x[4]xi1> +} + +// ----- + // CHECK-LABEL: func @transpose_elementwise_splat_constant // CHECK-SAME: (%[[A:.+]]: vector<4x6x3x2xf32>) // CHECK: %[[B:.+]] = arith.constant dense<5.000000e+00> : vector<4x6x3x2xf32> @@ -310,6 +384,22 @@ func.func @transpose_elementwise_splat_constant(%a : vector<4x6x3x2xf32>) -> vec // ----- +// CHECK-LABEL: func @transpose_elementwise_splat_constant_scalable +// CHECK-SAME: (%[[A:.+]]: vector<[4]x6x3x2xf32>) +// CHECK: %[[B:.+]] = arith.constant dense<5.000000e+00> : vector<[4]x6x3x2xf32> +// CHECK: %[[ADD:.+]] = arith.addf %[[A]], %[[B]] : vector<[4]x6x3x2xf32> +// CHECK: %[[T:.+]] = vector.transpose %[[ADD]], [1, 0, 3, 2] : vector<[4]x6x3x2xf32> to vector<6x[4]x2x3xf32> +// CHECK: return %[[T:.+]] : vector<6x[4]x2x3xf32> + +func.func @transpose_elementwise_splat_constant_scalable(%a : vector<[4]x6x3x2xf32>) -> vector<6x[4]x2x3xf32> { + %b = arith.constant dense<5.0> : vector<6x[4]x2x3xf32> + %at = vector.transpose %a, [1, 0, 3, 2]: vector<[4]x6x3x2xf32> to vector<6x[4]x2x3xf32> + %r = arith.addf %at, %b : vector<6x[4]x2x3xf32> + return %r : vector<6x[4]x2x3xf32> +} + +// ----- + // CHECK-LABEL: func @transpose_elementwise_diff_map // CHECK: vector.transpose // CHECK: vector.transpose @@ -320,3 +410,16 @@ func.func @transpose_elementwise_diff_map(%a : vector<4x6x3x2xf32>, %b: vector<6 %r = arith.addf %at, %bt : vector<6x4x2x3xf32> return %r : vector<6x4x2x3xf32> } + +// ----- + +// CHECK-LABEL: func @transpose_elementwise_diff_map_scalable +// CHECK: 
vector.transpose +// CHECK: vector.transpose +// CHECK: arith.addf +func.func @transpose_elementwise_diff_map_scalable(%a : vector<[4]x6x3x2xf32>, %b: vector<6x2x[4]x3xf32>) -> vector<6x[4]x2x3xf32> { + %at = vector.transpose %a, [1, 0, 3, 2]: vector<[4]x6x3x2xf32> to vector<6x[4]x2x3xf32> + %bt = vector.transpose %b, [0, 2, 1, 3]: vector<6x2x[4]x3xf32> to vector<6x[4]x2x3xf32> + %r = arith.addf %at, %bt : vector<6x[4]x2x3xf32> + return %r : vector<6x[4]x2x3xf32> +} From c593e3d0f77509ce65a6f5bd744f2d1ea9935c47 Mon Sep 17 00:00:00 2001 From: Kiran Chandramohan Date: Wed, 15 Jan 2025 15:06:57 +0000 Subject: [PATCH 72/82] =?UTF-8?q?[Flang][Driver]=20Add=20a=20flag=20to=20c?= =?UTF-8?q?ontrol=20zero=20initialization=20of=20global=20v=E2=80=A6=20(#1?= =?UTF-8?q?22144)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …ariables Patch adds a flag to control zero initialization of global variables without default initialization. The default is to zero initialize. --- clang/include/clang/Driver/Options.td | 5 +++++ clang/lib/Driver/ToolChains/Flang.cpp | 4 +++- flang/include/flang/Lower/LoweringOptions.def | 3 +++ flang/lib/Frontend/CompilerInvocation.cpp | 8 ++++++++ flang/lib/Lower/ConvertVariable.cpp | 6 +++++- flang/test/Driver/fno-zero-init.f90 | 9 +++++++++ flang/test/Lower/zero_init.f90 | 17 +++++++++++++++++ flang/test/Lower/zero_init_default_init.f90 | 19 +++++++++++++++++++ 8 files changed, 69 insertions(+), 2 deletions(-) create mode 100644 flang/test/Driver/fno-zero-init.f90 create mode 100644 flang/test/Lower/zero_init.f90 create mode 100644 flang/test/Lower/zero_init_default_init.f90 diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 2721c1b5d8dc5..dacfca910acc4 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3505,6 +3505,11 @@ def fno_struct_path_tbaa : Flag<["-"], "fno-struct-path-tbaa">, Group; def fno_strict_enums : Flag<["-"], "fno-strict-enums">, Group; def fno_strict_overflow : Flag<["-"], "fno-strict-overflow">, Group, Visibility<[ClangOption, FlangOption]>; +defm init_global_zero : BoolOptionWithoutMarshalling<"f", "init-global-zero", + PosFlag, + NegFlag>; def fno_pointer_tbaa : Flag<["-"], "fno-pointer-tbaa">, Group; def fno_temp_file : Flag<["-"], "fno-temp-file">, Group, Visibility<[ClangOption, CC1Option, CLOption, DXCOption]>, HelpText< diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index a7d0cc99f27d2..c46e8222a9631 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -155,7 +155,9 @@ void Flang::addCodegenOptions(const ArgList &Args, options::OPT_flang_deprecated_no_hlfir, options::OPT_fno_ppc_native_vec_elem_order, options::OPT_fppc_native_vec_elem_order, - options::OPT_ftime_report, options::OPT_ftime_report_EQ}); + options::OPT_finit_global_zero, + options::OPT_fno_init_global_zero, options::OPT_ftime_report, + options::OPT_ftime_report_EQ}); } void Flang::addPicOptions(const ArgList &Args, ArgStringList &CmdArgs) const { diff --git a/flang/include/flang/Lower/LoweringOptions.def b/flang/include/flang/Lower/LoweringOptions.def index 5a6debfdffe03..396c91948be36 100644 --- a/flang/include/flang/Lower/LoweringOptions.def +++ b/flang/include/flang/Lower/LoweringOptions.def @@ -44,5 +44,8 @@ ENUM_LOWERINGOPT(IntegerWrapAround, unsigned, 1, 0) /// If false, assume that the shapes/types/allocation-status match. 
ENUM_LOWERINGOPT(ReallocateLHS, unsigned, 1, 1) +/// If true, initialize globals without initialization to zero. +/// On by default. +ENUM_LOWERINGOPT(InitGlobalZero, unsigned, 1, 1) #undef LOWERINGOPT #undef ENUM_LOWERINGOPT diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index 5e7127313c133..78d1199c19749 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -1373,6 +1373,14 @@ bool CompilerInvocation::createFromArgs( invoc.loweringOpts.setNoPPCNativeVecElemOrder(true); } + // -f[no-]init-global-zero + if (args.hasFlag(clang::driver::options::OPT_finit_global_zero, + clang::driver::options::OPT_fno_init_global_zero, + /*default=*/true)) + invoc.loweringOpts.setInitGlobalZero(true); + else + invoc.loweringOpts.setInitGlobalZero(false); + // Preserve all the remark options requested, i.e. -Rpass, -Rpass-missed or // -Rpass-analysis. This will be used later when processing and outputting the // remarks generated by LLVM in ExecuteCompilerInvocation.cpp. diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index 9ee42d5cd8800..87236dc293ebb 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -635,7 +635,11 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter, global.setLinkName(builder.createCommonLinkage()); Fortran::lower::createGlobalInitialization( builder, global, [&](fir::FirOpBuilder &builder) { - mlir::Value initValue = builder.create(loc, symTy); + mlir::Value initValue; + if (converter.getLoweringOptions().getInitGlobalZero()) + initValue = builder.create(loc, symTy); + else + initValue = builder.create(loc, symTy); builder.create(loc, initValue); }); } diff --git a/flang/test/Driver/fno-zero-init.f90 b/flang/test/Driver/fno-zero-init.f90 new file mode 100644 index 0000000000000..2ffa10dd040d5 --- /dev/null +++ b/flang/test/Driver/fno-zero-init.f90 @@ -0,0 +1,9 @@ +! Check that the driver passes through -f[no-]init-global-zero: +! RUN: %flang -### -S -finit-global-zero %s -o - 2>&1 | FileCheck --check-prefix=CHECK-POS %s +! RUN: %flang -### -S -fno-init-global-zero %s -o - 2>&1 | FileCheck --check-prefix=CHECK-NEG %s +! Check that the compiler accepts -f[no-]init-global-zero: +! RUN: %flang_fc1 -emit-hlfir -finit-global-zero %s -o - +! RUN: %flang_fc1 -emit-hlfir -fno-init-global-zero %s -o - + +! CHECK-POS: "-fc1"{{.*}}"-finit-global-zero" +! CHECK-NEG: "-fc1"{{.*}}"-fno-init-global-zero" diff --git a/flang/test/Lower/zero_init.f90 b/flang/test/Lower/zero_init.f90 new file mode 100644 index 0000000000000..89e6584f410f7 --- /dev/null +++ b/flang/test/Lower/zero_init.f90 @@ -0,0 +1,17 @@ +! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-DEFAULT %s +! RUN: %flang_fc1 -finit-global-zero -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-DEFAULT %s +! 
RUN: %flang_fc1 -fno-init-global-zero -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-NO-ZERO-INIT %s + +module m1 + real :: x +end module m1 + +!CHECK-DEFAULT: fir.global @_QMm1Ex : f32 { +!CHECK-DEFAULT: %[[UNDEF:.*]] = fir.zero_bits f32 +!CHECK-DEFAULT: fir.has_value %[[UNDEF]] : f32 +!CHECK-DEFAULT: } + +!CHECK-NO-ZERO-INIT: fir.global @_QMm1Ex : f32 { +!CHECK-NO-ZERO-INIT: %[[UNDEF:.*]] = fir.undefined f32 +!CHECK-NO-ZERO-INIT: fir.has_value %[[UNDEF]] : f32 +!CHECK-NO-ZERO-INIT: } diff --git a/flang/test/Lower/zero_init_default_init.f90 b/flang/test/Lower/zero_init_default_init.f90 new file mode 100644 index 0000000000000..761052b5b08a0 --- /dev/null +++ b/flang/test/Lower/zero_init_default_init.f90 @@ -0,0 +1,19 @@ +! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s +! RUN: %flang_fc1 -finit-global-zero -emit-hlfir -o - %s | FileCheck %s +! RUN: %flang_fc1 -fno-init-global-zero -emit-hlfir -o - %s | FileCheck %s + +! Test that the flag does not affect globals with default init + +module m2 + type val + integer :: my_val = 1 + end type val + type(val) :: v1 +end module m2 + +!CHECK: fir.global @_QMm2Ev1 : !fir.type<_QMm2Tval{my_val:i32}> { +!CHECK: %[[V1:.*]] = fir.undefined !fir.type<_QMm2Tval{my_val:i32}> +!CHECK: %[[ONE:.*]] = arith.constant 1 : i32 +!CHECK: %[[V1_INIT:.*]] = fir.insert_value %[[V1]], %[[ONE]], ["my_val", !fir.type<_QMm2Tval{my_val:i32}>] : (!fir.type<_QMm2Tval{my_val:i32}>, i32) -> !fir.type<_QMm2Tval{my_val:i32}> +!CHECK: fir.has_value %[[V1_INIT]] : !fir.type<_QMm2Tval{my_val:i32}> +!CHECK: } From 44ba43aa2b740878d83a9d6f1d52a333c0d48c22 Mon Sep 17 00:00:00 2001 From: Kiran Chandramohan Date: Wed, 15 Jan 2025 15:23:34 +0000 Subject: [PATCH 73/82] =?UTF-8?q?Revert=20"[Flang][Driver]=20Add=20a=20fla?= =?UTF-8?q?g=20to=20control=20zero=20initialization=20of=20global=20v?= =?UTF-8?q?=E2=80=A6"=20(#123067)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reverts llvm/llvm-project#122144 Reverting due to CI failure https://lab.llvm.org/buildbot/#/builders/89/builds/14422 --- clang/include/clang/Driver/Options.td | 5 ----- clang/lib/Driver/ToolChains/Flang.cpp | 4 +--- flang/include/flang/Lower/LoweringOptions.def | 3 --- flang/lib/Frontend/CompilerInvocation.cpp | 8 -------- flang/lib/Lower/ConvertVariable.cpp | 6 +----- flang/test/Driver/fno-zero-init.f90 | 9 --------- flang/test/Lower/zero_init.f90 | 17 ----------------- flang/test/Lower/zero_init_default_init.f90 | 19 ------------------- 8 files changed, 2 insertions(+), 69 deletions(-) delete mode 100644 flang/test/Driver/fno-zero-init.f90 delete mode 100644 flang/test/Lower/zero_init.f90 delete mode 100644 flang/test/Lower/zero_init_default_init.f90 diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index dacfca910acc4..2721c1b5d8dc5 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3505,11 +3505,6 @@ def fno_struct_path_tbaa : Flag<["-"], "fno-struct-path-tbaa">, Group; def fno_strict_enums : Flag<["-"], "fno-strict-enums">, Group; def fno_strict_overflow : Flag<["-"], "fno-strict-overflow">, Group, Visibility<[ClangOption, FlangOption]>; -defm init_global_zero : BoolOptionWithoutMarshalling<"f", "init-global-zero", - PosFlag, - NegFlag>; def fno_pointer_tbaa : Flag<["-"], "fno-pointer-tbaa">, Group; def fno_temp_file : Flag<["-"], "fno-temp-file">, Group, Visibility<[ClangOption, CC1Option, CLOption, DXCOption]>, HelpText< diff --git a/clang/lib/Driver/ToolChains/Flang.cpp 
b/clang/lib/Driver/ToolChains/Flang.cpp index c46e8222a9631..a7d0cc99f27d2 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -155,9 +155,7 @@ void Flang::addCodegenOptions(const ArgList &Args, options::OPT_flang_deprecated_no_hlfir, options::OPT_fno_ppc_native_vec_elem_order, options::OPT_fppc_native_vec_elem_order, - options::OPT_finit_global_zero, - options::OPT_fno_init_global_zero, options::OPT_ftime_report, - options::OPT_ftime_report_EQ}); + options::OPT_ftime_report, options::OPT_ftime_report_EQ}); } void Flang::addPicOptions(const ArgList &Args, ArgStringList &CmdArgs) const { diff --git a/flang/include/flang/Lower/LoweringOptions.def b/flang/include/flang/Lower/LoweringOptions.def index 396c91948be36..5a6debfdffe03 100644 --- a/flang/include/flang/Lower/LoweringOptions.def +++ b/flang/include/flang/Lower/LoweringOptions.def @@ -44,8 +44,5 @@ ENUM_LOWERINGOPT(IntegerWrapAround, unsigned, 1, 0) /// If false, assume that the shapes/types/allocation-status match. ENUM_LOWERINGOPT(ReallocateLHS, unsigned, 1, 1) -/// If true, initialize globals without initialization to zero. -/// On by default. -ENUM_LOWERINGOPT(InitGlobalZero, unsigned, 1, 1) #undef LOWERINGOPT #undef ENUM_LOWERINGOPT diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index 78d1199c19749..5e7127313c133 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -1373,14 +1373,6 @@ bool CompilerInvocation::createFromArgs( invoc.loweringOpts.setNoPPCNativeVecElemOrder(true); } - // -f[no-]init-global-zero - if (args.hasFlag(clang::driver::options::OPT_finit_global_zero, - clang::driver::options::OPT_fno_init_global_zero, - /*default=*/true)) - invoc.loweringOpts.setInitGlobalZero(true); - else - invoc.loweringOpts.setInitGlobalZero(false); - // Preserve all the remark options requested, i.e. -Rpass, -Rpass-missed or // -Rpass-analysis. This will be used later when processing and outputting the // remarks generated by LLVM in ExecuteCompilerInvocation.cpp. diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index 87236dc293ebb..9ee42d5cd8800 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -635,11 +635,7 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter, global.setLinkName(builder.createCommonLinkage()); Fortran::lower::createGlobalInitialization( builder, global, [&](fir::FirOpBuilder &builder) { - mlir::Value initValue; - if (converter.getLoweringOptions().getInitGlobalZero()) - initValue = builder.create(loc, symTy); - else - initValue = builder.create(loc, symTy); + mlir::Value initValue = builder.create(loc, symTy); builder.create(loc, initValue); }); } diff --git a/flang/test/Driver/fno-zero-init.f90 b/flang/test/Driver/fno-zero-init.f90 deleted file mode 100644 index 2ffa10dd040d5..0000000000000 --- a/flang/test/Driver/fno-zero-init.f90 +++ /dev/null @@ -1,9 +0,0 @@ -! Check that the driver passes through -f[no-]init-global-zero: -! RUN: %flang -### -S -finit-global-zero %s -o - 2>&1 | FileCheck --check-prefix=CHECK-POS %s -! RUN: %flang -### -S -fno-init-global-zero %s -o - 2>&1 | FileCheck --check-prefix=CHECK-NEG %s -! Check that the compiler accepts -f[no-]init-global-zero: -! RUN: %flang_fc1 -emit-hlfir -finit-global-zero %s -o - -! RUN: %flang_fc1 -emit-hlfir -fno-init-global-zero %s -o - - -! CHECK-POS: "-fc1"{{.*}}"-finit-global-zero" -! 
CHECK-NEG: "-fc1"{{.*}}"-fno-init-global-zero" diff --git a/flang/test/Lower/zero_init.f90 b/flang/test/Lower/zero_init.f90 deleted file mode 100644 index 89e6584f410f7..0000000000000 --- a/flang/test/Lower/zero_init.f90 +++ /dev/null @@ -1,17 +0,0 @@ -! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-DEFAULT %s -! RUN: %flang_fc1 -finit-global-zero -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-DEFAULT %s -! RUN: %flang_fc1 -fno-init-global-zero -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-NO-ZERO-INIT %s - -module m1 - real :: x -end module m1 - -!CHECK-DEFAULT: fir.global @_QMm1Ex : f32 { -!CHECK-DEFAULT: %[[UNDEF:.*]] = fir.zero_bits f32 -!CHECK-DEFAULT: fir.has_value %[[UNDEF]] : f32 -!CHECK-DEFAULT: } - -!CHECK-NO-ZERO-INIT: fir.global @_QMm1Ex : f32 { -!CHECK-NO-ZERO-INIT: %[[UNDEF:.*]] = fir.undefined f32 -!CHECK-NO-ZERO-INIT: fir.has_value %[[UNDEF]] : f32 -!CHECK-NO-ZERO-INIT: } diff --git a/flang/test/Lower/zero_init_default_init.f90 b/flang/test/Lower/zero_init_default_init.f90 deleted file mode 100644 index 761052b5b08a0..0000000000000 --- a/flang/test/Lower/zero_init_default_init.f90 +++ /dev/null @@ -1,19 +0,0 @@ -! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s -! RUN: %flang_fc1 -finit-global-zero -emit-hlfir -o - %s | FileCheck %s -! RUN: %flang_fc1 -fno-init-global-zero -emit-hlfir -o - %s | FileCheck %s - -! Test that the flag does not affect globals with default init - -module m2 - type val - integer :: my_val = 1 - end type val - type(val) :: v1 -end module m2 - -!CHECK: fir.global @_QMm2Ev1 : !fir.type<_QMm2Tval{my_val:i32}> { -!CHECK: %[[V1:.*]] = fir.undefined !fir.type<_QMm2Tval{my_val:i32}> -!CHECK: %[[ONE:.*]] = arith.constant 1 : i32 -!CHECK: %[[V1_INIT:.*]] = fir.insert_value %[[V1]], %[[ONE]], ["my_val", !fir.type<_QMm2Tval{my_val:i32}>] : (!fir.type<_QMm2Tval{my_val:i32}>, i32) -> !fir.type<_QMm2Tval{my_val:i32}> -!CHECK: fir.has_value %[[V1_INIT]] : !fir.type<_QMm2Tval{my_val:i32}> -!CHECK: } From ff862d6de92f478253a332ec48cfc2c2add76bb3 Mon Sep 17 00:00:00 2001 From: vdonaldson <37090318+vdonaldson@users.noreply.github.com> Date: Wed, 15 Jan 2025 10:55:09 -0500 Subject: [PATCH 74/82] [flang] Modifications to ieee floating point environment procedures (#121949) Intrinsic module procedures ieee_get_modes, ieee_set_modes, ieee_get_status, and ieee_set_status store and retrieve opaque data values whose size varies by machine and OS environment. These data values are usually, but not always small. Their sizes are not directly known in a cross compilation environment. Address this issue by implementing two mechanisms for processing these data values. Environments that use typical small data sizes can access storage defined at compile time. When this is not valid, data storage of any size can be allocated at runtime. 
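To make the two mechanisms concrete, here is a rough C++ analogue (the
StatusSlot type and helper names are invented for illustration; the real
lowering in the diff below keys the choice on the target, e.g. SPARC,
rather than on sizeof):

  #include <cfenv>
  #include <cstddef>
  #include <vector>

  // Fixed storage sized at compile time covers the typical small fenv_t;
  // when the object is larger, storage is allocated at runtime instead.
  struct StatusSlot {
    static constexpr std::size_t kInlineBytes =
        8 * sizeof(int); // cf. _FORTRAN_RUNTIME_IEEE_FENV_T_EXTENT
    alignas(std::fenv_t) unsigned char inlineData[kInlineBytes];
    std::vector<unsigned char> heapData;

    void *storage() {
      if (sizeof(std::fenv_t) <= kInlineBytes)
        return inlineData;                     // compile-time mechanism
      if (heapData.empty())
        heapData.resize(sizeof(std::fenv_t));  // runtime allocation
      return heapData.data();
    }
  };

  int saveStatus(StatusSlot &s) {
    return std::fegetenv(static_cast<std::fenv_t *>(s.storage()));
  }
  int restoreStatus(StatusSlot &s) {
    return std::fesetenv(static_cast<std::fenv_t *>(s.storage()));
  }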
--- flang/include/flang/Evaluate/target.h | 4 + .../flang/Optimizer/Builder/IntrinsicCall.h | 6 +- .../Optimizer/Builder/Runtime/Exceptions.h | 4 + flang/include/flang/Runtime/exceptions.h | 5 + flang/include/flang/Runtime/magic-numbers.h | 9 +- flang/include/flang/Tools/TargetSetup.h | 3 + flang/lib/Evaluate/target.cpp | 1 + flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 89 +++++++++---- .../Optimizer/Builder/Runtime/Exceptions.cpp | 14 ++ flang/module/__fortran_ieee_exceptions.f90 | 6 +- flang/runtime/exceptions.cpp | 41 +++--- flang/test/Lower/Intrinsics/ieee_femodes.f90 | 82 ------------ flang/test/Lower/Intrinsics/ieee_festatus.f90 | 120 ------------------ 13 files changed, 125 insertions(+), 259 deletions(-) delete mode 100644 flang/test/Lower/Intrinsics/ieee_femodes.f90 delete mode 100644 flang/test/Lower/Intrinsics/ieee_festatus.f90 diff --git a/flang/include/flang/Evaluate/target.h b/flang/include/flang/Evaluate/target.h index 154561ce868eb..e07f916b875e0 100644 --- a/flang/include/flang/Evaluate/target.h +++ b/flang/include/flang/Evaluate/target.h @@ -112,6 +112,9 @@ class TargetCharacteristics { bool isPPC() const { return isPPC_; } void set_isPPC(bool isPPC = false); + bool isSPARC() const { return isSPARC_; } + void set_isSPARC(bool isSPARC = false); + bool isOSWindows() const { return isOSWindows_; } void set_isOSWindows(bool isOSWindows = false) { isOSWindows_ = isOSWindows; @@ -126,6 +129,7 @@ class TargetCharacteristics { std::uint8_t align_[common::TypeCategory_enumSize][maxKind + 1]{}; bool isBigEndian_{false}; bool isPPC_{false}; + bool isSPARC_{false}; bool isOSWindows_{false}; bool haltingSupportIsUnknownAtCompileTime_{false}; bool areSubnormalsFlushedToZero_{false}; diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index 18f84c7021e11..9c9c0609f4fc3 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -269,10 +269,8 @@ struct IntrinsicLibrary { mlir::Value genIeeeCopySign(mlir::Type, llvm::ArrayRef); void genIeeeGetFlag(llvm::ArrayRef); void genIeeeGetHaltingMode(llvm::ArrayRef); - template - void genIeeeGetOrSetModes(llvm::ArrayRef); - template - void genIeeeGetOrSetStatus(llvm::ArrayRef); + template + void genIeeeGetOrSetModesOrStatus(llvm::ArrayRef); void genIeeeGetRoundingMode(llvm::ArrayRef); void genIeeeGetUnderflowMode(llvm::ArrayRef); mlir::Value genIeeeInt(mlir::Type, llvm::ArrayRef); diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Exceptions.h b/flang/include/flang/Optimizer/Builder/Runtime/Exceptions.h index f44e0c95ef6d4..7487444f3a7a9 100644 --- a/flang/include/flang/Optimizer/Builder/Runtime/Exceptions.h +++ b/flang/include/flang/Optimizer/Builder/Runtime/Exceptions.h @@ -33,5 +33,9 @@ mlir::Value genGetUnderflowMode(fir::FirOpBuilder &builder, mlir::Location loc); void genSetUnderflowMode(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value bit); +mlir::Value genGetModesTypeSize(fir::FirOpBuilder &builder, mlir::Location loc); +mlir::Value genGetStatusTypeSize(fir::FirOpBuilder &builder, + mlir::Location loc); + } // namespace fir::runtime #endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_EXCEPTIONS_H diff --git a/flang/include/flang/Runtime/exceptions.h b/flang/include/flang/Runtime/exceptions.h index 483d0271bcab0..62c21f01c1289 100644 --- a/flang/include/flang/Runtime/exceptions.h +++ b/flang/include/flang/Runtime/exceptions.h @@ -13,6 +13,7 @@ #include "flang/Runtime/entry-names.h" 
#include +#include namespace Fortran::runtime { @@ -32,6 +33,10 @@ bool RTNAME(SupportHalting)(uint32_t except); bool RTNAME(GetUnderflowMode)(void); void RTNAME(SetUnderflowMode)(bool flag); +// Get the byte size of ieee_modes_type and ieee_status_type data. +std::size_t RTNAME(GetModesTypeSize)(void); +std::size_t RTNAME(GetStatusTypeSize)(void); + } // extern "C" } // namespace Fortran::runtime #endif // FORTRAN_RUNTIME_EXCEPTIONS_H_ diff --git a/flang/include/flang/Runtime/magic-numbers.h b/flang/include/flang/Runtime/magic-numbers.h index 1d3c5dca0b4bf..6788ba098bcf9 100644 --- a/flang/include/flang/Runtime/magic-numbers.h +++ b/flang/include/flang/Runtime/magic-numbers.h @@ -118,11 +118,10 @@ ieee_arithmetic module rounding procedures. #define _FORTRAN_RUNTIME_IEEE_OTHER 5 #if 0 -The size of derived types ieee_modes_type and ieee_status_type from intrinsic -module ieee_exceptions must be large enough to hold an fenv.h object of type -femode_t and fenv_t, respectively. These types have members that are declared -as int arrays with the following extents to allow build time validation of -these sizes in cross compilation environments. +INTEGER(kind=4) extents for ieee_exceptions module types ieee_modes_type and +ieee_status_type. These extent values are large enough to hold femode_t and +fenv_t data in many environments. An environment that does not meet these +size constraints may allocate memory with runtime size values. #endif #define _FORTRAN_RUNTIME_IEEE_FEMODE_T_EXTENT 2 #define _FORTRAN_RUNTIME_IEEE_FENV_T_EXTENT 8 diff --git a/flang/include/flang/Tools/TargetSetup.h b/flang/include/flang/Tools/TargetSetup.h index 709c4bbe4b7b0..d1b0da3a42c89 100644 --- a/flang/include/flang/Tools/TargetSetup.h +++ b/flang/include/flang/Tools/TargetSetup.h @@ -71,6 +71,9 @@ namespace Fortran::tools { if (targetTriple.isPPC()) targetCharacteristics.set_isPPC(true); + if (targetTriple.isSPARC()) + targetCharacteristics.set_isSPARC(true); + if (targetTriple.isOSWindows()) targetCharacteristics.set_isOSWindows(true); diff --git a/flang/lib/Evaluate/target.cpp b/flang/lib/Evaluate/target.cpp index 409e28c767e1e..94dc35ecd5900 100644 --- a/flang/lib/Evaluate/target.cpp +++ b/flang/lib/Evaluate/target.cpp @@ -104,6 +104,7 @@ void TargetCharacteristics::set_isBigEndian(bool isBig) { } void TargetCharacteristics::set_isPPC(bool isPowerPC) { isPPC_ = isPowerPC; } +void TargetCharacteristics::set_isSPARC(bool isSPARC) { isSPARC_ = isSPARC; } void TargetCharacteristics::set_areSubnormalsFlushedToZero(bool yes) { areSubnormalsFlushedToZero_ = yes; diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index e6d0f044dcf84..f6f2e15e469e6 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -50,6 +50,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include // temporary -- only used in genIeeeGetOrSetModesOrStatus #include #include @@ -318,13 +319,15 @@ static constexpr IntrinsicHandler handlers[]{ {"ieee_get_halting_mode", &I::genIeeeGetHaltingMode, {{{"flag", asValue}, {"halting", asAddr}}}}, - {"ieee_get_modes", &I::genIeeeGetOrSetModes}, + {"ieee_get_modes", + &I::genIeeeGetOrSetModesOrStatus}, {"ieee_get_rounding_mode", &I::genIeeeGetRoundingMode, {{{"round_value", asAddr, handleDynamicOptional}, {"radix", asValue, handleDynamicOptional}}}, /*isElemental=*/false}, - {"ieee_get_status", &I::genIeeeGetOrSetStatus}, + 
{"ieee_get_status", + &I::genIeeeGetOrSetModesOrStatus}, {"ieee_get_underflow_mode", &I::genIeeeGetUnderflowMode, {{{"gradual", asAddr}}}, @@ -368,13 +371,15 @@ static constexpr IntrinsicHandler handlers[]{ {"ieee_set_flag", &I::genIeeeSetFlagOrHaltingMode}, {"ieee_set_halting_mode", &I::genIeeeSetFlagOrHaltingMode}, - {"ieee_set_modes", &I::genIeeeGetOrSetModes}, + {"ieee_set_modes", + &I::genIeeeGetOrSetModesOrStatus}, {"ieee_set_rounding_mode", &I::genIeeeSetRoundingMode, {{{"round_value", asValue, handleDynamicOptional}, {"radix", asValue, handleDynamicOptional}}}, /*isElemental=*/false}, - {"ieee_set_status", &I::genIeeeGetOrSetStatus}, + {"ieee_set_status", + &I::genIeeeGetOrSetModesOrStatus}, {"ieee_set_underflow_mode", &I::genIeeeSetUnderflowMode}, {"ieee_signaling_eq", &I::genIeeeSignalingCompare}, @@ -4108,11 +4113,12 @@ void IntrinsicLibrary::genRaiseExcept(int excepts, mlir::Value cond) { // Return a reference to the contents of a derived type with one field. // Also return the field type. static std::pair -getFieldRef(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value rec) { +getFieldRef(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value rec, + unsigned index = 0) { auto recType = mlir::dyn_cast(fir::unwrapPassByRefType(rec.getType())); - assert(recType.getTypeList().size() == 1 && "expected exactly one component"); - auto [fieldName, fieldTy] = recType.getTypeList().front(); + assert(index < recType.getTypeList().size() && "not enough components"); + auto [fieldName, fieldTy] = recType.getTypeList()[index]; mlir::Value field = builder.create( loc, fir::FieldType::get(recType.getContext()), fieldName, recType, fir::getTypeParams(rec)); @@ -4502,15 +4508,60 @@ void IntrinsicLibrary::genIeeeGetHaltingMode( } // IEEE_GET_MODES, IEEE_SET_MODES -template -void IntrinsicLibrary::genIeeeGetOrSetModes( +// IEEE_GET_STATUS, IEEE_SET_STATUS +template +void IntrinsicLibrary::genIeeeGetOrSetModesOrStatus( llvm::ArrayRef args) { assert(args.size() == 1); - mlir::Type ptrTy = builder.getRefType(builder.getIntegerType(32)); +#ifndef __GLIBC_USE_IEC_60559_BFP_EXT // only use of "#include " + // No definitions of fegetmode, fesetmode + llvm::StringRef func = isModes + ? (isGet ? "ieee_get_modes" : "ieee_set_modes") + : (isGet ? "ieee_get_status" : "ieee_set_status"); + TODO(loc, "intrinsic module procedure: " + func); +#else mlir::Type i32Ty = builder.getIntegerType(32); - mlir::Value addr = - builder.create(loc, ptrTy, getBase(args[0])); - genRuntimeCall(isGet ? "fegetmode" : "fesetmode", i32Ty, addr); + mlir::Type i64Ty = builder.getIntegerType(64); + mlir::Type ptrTy = builder.getRefType(i32Ty); + mlir::Value addr; + if (fir::getTargetTriple(builder.getModule()).isSPARC()) { + // Floating point environment data is larger than the __data field + // allotment. Allocate data space from the heap. + auto [fieldRef, fieldTy] = + getFieldRef(builder, loc, fir::getBase(args[0]), 1); + addr = builder.create( + loc, builder.create(loc, fieldRef)); + mlir::Type heapTy = addr.getType(); + mlir::Value allocated = builder.create( + loc, mlir::arith::CmpIPredicate::ne, + builder.createConvert(loc, i64Ty, addr), + builder.createIntegerConstant(loc, i64Ty, 0)); + auto ifOp = builder.create(loc, heapTy, allocated, + /*withElseRegion=*/true); + builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); + builder.create(loc, addr); + builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); + mlir::Value byteSize = + isModes ? 
fir::runtime::genGetModesTypeSize(builder, loc) + : fir::runtime::genGetStatusTypeSize(builder, loc); + byteSize = builder.createConvert(loc, builder.getIndexType(), byteSize); + addr = + builder.create(loc, extractSequenceType(heapTy), + /*typeparams=*/std::nullopt, byteSize); + mlir::Value shape = builder.create(loc, byteSize); + builder.create( + loc, builder.create(loc, fieldTy, addr, shape), fieldRef); + builder.create(loc, addr); + builder.setInsertionPointAfter(ifOp); + addr = builder.create(loc, ptrTy, ifOp.getResult(0)); + } else { + // Place floating point environment data in __data storage. + addr = builder.create(loc, ptrTy, getBase(args[0])); + } + llvm::StringRef func = isModes ? (isGet ? "fegetmode" : "fesetmode") + : (isGet ? "fegetenv" : "fesetenv"); + genRuntimeCall(func, i32Ty, addr); +#endif } // Check that an explicit ieee_[get|set]_rounding_mode call radix value is 2. @@ -4543,18 +4594,6 @@ void IntrinsicLibrary::genIeeeGetRoundingMode( builder.create(loc, mode, fieldRef); } -// IEEE_GET_STATUS, IEEE_SET_STATUS -template -void IntrinsicLibrary::genIeeeGetOrSetStatus( - llvm::ArrayRef args) { - assert(args.size() == 1); - mlir::Type ptrTy = builder.getRefType(builder.getIntegerType(32)); - mlir::Type i32Ty = builder.getIntegerType(32); - mlir::Value addr = - builder.create(loc, ptrTy, getBase(args[0])); - genRuntimeCall(isGet ? "fegetenv" : "fesetenv", i32Ty, addr); -} - // IEEE_GET_UNDERFLOW_MODE void IntrinsicLibrary::genIeeeGetUnderflowMode( llvm::ArrayRef args) { diff --git a/flang/lib/Optimizer/Builder/Runtime/Exceptions.cpp b/flang/lib/Optimizer/Builder/Runtime/Exceptions.cpp index 630281fdb593d..c545b3d00b4d7 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Exceptions.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Exceptions.cpp @@ -42,3 +42,17 @@ void fir::runtime::genSetUnderflowMode(fir::FirOpBuilder &builder, fir::runtime::getRuntimeFunc(loc, builder)}; builder.create(loc, func, flag); } + +mlir::Value fir::runtime::genGetModesTypeSize(fir::FirOpBuilder &builder, + mlir::Location loc) { + mlir::func::FuncOp func{ + fir::runtime::getRuntimeFunc(loc, builder)}; + return builder.create(loc, func).getResult(0); +} + +mlir::Value fir::runtime::genGetStatusTypeSize(fir::FirOpBuilder &builder, + mlir::Location loc) { + mlir::func::FuncOp func{ + fir::runtime::getRuntimeFunc(loc, builder)}; + return builder.create(loc, func).getResult(0); +} diff --git a/flang/module/__fortran_ieee_exceptions.f90 b/flang/module/__fortran_ieee_exceptions.f90 index 6691012eda238..3ac9b993186aa 100644 --- a/flang/module/__fortran_ieee_exceptions.f90 +++ b/flang/module/__fortran_ieee_exceptions.f90 @@ -36,13 +36,15 @@ ieee_all(*) = [ ieee_usual, ieee_underflow, ieee_inexact ] type, public :: ieee_modes_type ! Fortran 2018, 17.7 - private ! opaque fenv.h femode_t data + private ! opaque fenv.h femode_t data; code will access only one component integer(kind=4) :: __data(_FORTRAN_RUNTIME_IEEE_FEMODE_T_EXTENT) + integer(kind=1), allocatable :: __allocatable_data(:) end type ieee_modes_type type, public :: ieee_status_type ! Fortran 2018, 17.7 - private ! opaque fenv.h fenv_t data + private ! opaque fenv.h fenv_t data; code will access only one component integer(kind=4) :: __data(_FORTRAN_RUNTIME_IEEE_FENV_T_EXTENT) + integer(kind=1), allocatable :: __allocatable_data(:) end type ieee_status_type ! Define specifics with 1 LOGICAL or REAL argument for generic G. 
diff --git a/flang/runtime/exceptions.cpp b/flang/runtime/exceptions.cpp index 2fa2baa2ec84a..f541b8e844ade 100644 --- a/flang/runtime/exceptions.cpp +++ b/flang/runtime/exceptions.cpp @@ -15,14 +15,10 @@ #include #endif -// When not supported, these macro are undefined in cfenv.h, -// set them to zero in that case. +// fenv.h may not define exception macros. #ifndef FE_INVALID #define FE_INVALID 0 #endif -#ifndef __FE_DENORM -#define __FE_DENORM 0 // denorm is nonstandard -#endif #ifndef FE_DIVBYZERO #define FE_DIVBYZERO 0 #endif @@ -46,7 +42,11 @@ uint32_t RTNAME(MapException)(uint32_t excepts) { Terminator terminator{__FILE__, __LINE__}; static constexpr uint32_t v{FE_INVALID}; - static constexpr uint32_t s{__FE_DENORM}; // subnormal +#if __x86_64__ + static constexpr uint32_t s{__FE_DENORM}; // nonstandard, not a #define +#else + static constexpr uint32_t s{0}; +#endif static constexpr uint32_t z{FE_DIVBYZERO}; static constexpr uint32_t o{FE_OVERFLOW}; static constexpr uint32_t u{FE_UNDERFLOW}; @@ -62,25 +62,13 @@ uint32_t RTNAME(MapException)(uint32_t excepts) { static constexpr uint32_t map[]{xm}; static constexpr uint32_t mapSize{sizeof(map) / sizeof(uint32_t)}; static_assert(mapSize == 64); - if (excepts == 0 || excepts >= mapSize) { + if (excepts >= mapSize) { terminator.Crash("Invalid excepts value: %d", excepts); } uint32_t except_value = map[excepts]; - if (except_value == 0) { - terminator.Crash( - "Excepts value %d not supported by flang runtime", excepts); - } return except_value; } -// Verify that the size of ieee_modes_type and ieee_status_type objects from -// intrinsic module file __fortran_ieee_exceptions.f90 are large enough to -// hold fenv_t object. -// TODO: fenv_t can be way larger than -// sizeof(int) * _FORTRAN_RUNTIME_IEEE_FENV_T_EXTENT -// on some systems, e.g. Solaris, so omit object size comparison for now. -// TODO: consider femode_t object size comparison once its more mature. - // Check if the processor has the ability to control whether to halt or // continue execution when a given exception is raised. bool RTNAME(SupportHalting)([[maybe_unused]] uint32_t except) { @@ -103,7 +91,7 @@ bool RTNAME(SupportHalting)([[maybe_unused]] uint32_t except) { } bool RTNAME(GetUnderflowMode)(void) { -#if __x86_64__ +#if _MM_FLUSH_ZERO_MASK // The MXCSR Flush to Zero flag is the negation of the ieee_get_underflow_mode // GRADUAL argument. It affects real computations of kinds 3, 4, and 8. return _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_OFF; @@ -112,12 +100,23 @@ bool RTNAME(GetUnderflowMode)(void) { #endif } void RTNAME(SetUnderflowMode)(bool flag) { -#if __x86_64__ +#if _MM_FLUSH_ZERO_MASK // The MXCSR Flush to Zero flag is the negation of the ieee_set_underflow_mode // GRADUAL argument. It affects real computations of kinds 3, 4, and 8. _MM_SET_FLUSH_ZERO_MODE(flag ? _MM_FLUSH_ZERO_OFF : _MM_FLUSH_ZERO_ON); #endif } +size_t RTNAME(GetModesTypeSize)(void) { +#ifdef __GLIBC_USE_IEC_60559_BFP_EXT + return sizeof(femode_t); // byte size of ieee_modes_type data +#else + return 8; // femode_t is not defined +#endif +} +size_t RTNAME(GetStatusTypeSize)(void) { + return sizeof(fenv_t); // byte size of ieee_status_type data +} + } // extern "C" } // namespace Fortran::runtime diff --git a/flang/test/Lower/Intrinsics/ieee_femodes.f90 b/flang/test/Lower/Intrinsics/ieee_femodes.f90 deleted file mode 100644 index abb264cb027ea..0000000000000 --- a/flang/test/Lower/Intrinsics/ieee_femodes.f90 +++ /dev/null @@ -1,82 +0,0 @@ -! RUN: bbc -emit-fir -o - %s | FileCheck %s - -! 
CHECK-LABEL: c.func @_QQmain -program m - use ieee_arithmetic - use ieee_exceptions - - ! CHECK: %[[VAL_69:.*]] = fir.alloca !fir.type<_QM__fortran_ieee_exceptionsTieee_modes_type{_QM__fortran_ieee_exceptionsTieee_modes_type.__data:!fir.array<2xi32>}> {bindc_name = "modes", uniq_name = "_QFEmodes"} - ! CHECK: %[[VAL_70:.*]] = fir.declare %[[VAL_69]] {uniq_name = "_QFEmodes"} : (!fir.ref}>>) -> !fir.ref}>> - type(ieee_modes_type) :: modes - - ! CHECK: %[[VAL_71:.*]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_ieee_round_type{_QM__fortran_builtinsT__builtin_ieee_round_type.mode:i8}> {bindc_name = "round", uniq_name = "_QFEround"} - ! CHECK: %[[VAL_72:.*]] = fir.declare %[[VAL_71]] {uniq_name = "_QFEround"} : (!fir.ref>) -> !fir.ref> - type(ieee_round_type) :: round - - ! CHECK: %[[VAL_78:.*]] = fir.address_of(@_QQro._QM__fortran_builtinsT__builtin_ieee_round_type.0) : !fir.ref> - ! CHECK: %[[VAL_79:.*]] = fir.declare %[[VAL_78]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QQro._QM__fortran_builtinsT__builtin_ieee_round_type.0"} : (!fir.ref>) -> !fir.ref> - - ! CHECK: %[[VAL_80:.*]] = fir.field_index _QM__fortran_builtinsT__builtin_ieee_round_type.mode, !fir.type<_QM__fortran_builtinsT__builtin_ieee_round_type{_QM__fortran_builtinsT__builtin_ieee_round_type.mode:i8}> - ! CHECK: %[[VAL_81:.*]] = fir.coordinate_of %[[VAL_79]], %[[VAL_80]] : (!fir.ref>, !fir.field) -> !fir.ref - ! CHECK: %[[VAL_82:.*]] = fir.load %[[VAL_81]] : !fir.ref - ! CHECK: %[[VAL_83:.*]] = fir.convert %[[VAL_82]] : (i8) -> i32 - ! CHECK: fir.call @llvm.set.rounding(%[[VAL_83]]) fastmath : (i32) -> () - call ieee_set_rounding_mode(ieee_up) - - ! CHECK: %[[VAL_84:.*]] = fir.coordinate_of %[[VAL_72]], %[[VAL_80]] : (!fir.ref>, !fir.field) -> !fir.ref - ! CHECK: %[[VAL_85:.*]] = fir.call @llvm.get.rounding() fastmath : () -> i32 - ! CHECK: %[[VAL_86:.*]] = fir.convert %[[VAL_85]] : (i32) -> i8 - ! CHECK: fir.store %[[VAL_86]] to %[[VAL_84]] : !fir.ref - call ieee_get_rounding_mode(round) - - print*, 'rounding_mode [up ] : ', mode_name(round) - - ! CHECK: %[[VAL_103:.*]] = fir.convert %[[VAL_70]] : (!fir.ref}>>) -> !fir.ref - ! CHECK: %[[VAL_104:.*]] = fir.call @fegetmode(%[[VAL_103]]) fastmath : (!fir.ref) -> i32 - call ieee_get_modes(modes) - - ! CHECK: %[[VAL_105:.*]] = fir.address_of(@_QQro._QM__fortran_builtinsT__builtin_ieee_round_type.1) : !fir.ref> - ! CHECK: %[[VAL_106:.*]] = fir.declare %[[VAL_105]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QQro._QM__fortran_builtinsT__builtin_ieee_round_type.1"} : (!fir.ref>) -> !fir.ref> - ! CHECK: %[[VAL_107:.*]] = fir.coordinate_of %[[VAL_106]], %[[VAL_80]] : (!fir.ref>, !fir.field) -> !fir.ref - ! CHECK: %[[VAL_108:.*]] = fir.load %[[VAL_107]] : !fir.ref - ! CHECK: %[[VAL_109:.*]] = fir.convert %[[VAL_108]] : (i8) -> i32 - ! CHECK: fir.call @llvm.set.rounding(%[[VAL_109]]) fastmath : (i32) -> () - call ieee_set_rounding_mode(ieee_to_zero) - - ! CHECK: %[[VAL_110:.*]] = fir.call @llvm.get.rounding() fastmath : () -> i32 - ! CHECK: %[[VAL_111:.*]] = fir.convert %[[VAL_110]] : (i32) -> i8 - ! CHECK: fir.store %[[VAL_111]] to %[[VAL_84]] : !fir.ref - call ieee_get_rounding_mode(round) - - print*, 'rounding_mode [to_zero] : ', mode_name(round) - - ! CHECK: %[[VAL_126:.*]] = fir.call @fesetmode(%[[VAL_103]]) fastmath : (!fir.ref) -> i32 - call ieee_set_modes(modes) - - ! CHECK: %[[VAL_127:.*]] = fir.call @llvm.get.rounding() fastmath : () -> i32 - ! CHECK: %[[VAL_128:.*]] = fir.convert %[[VAL_127]] : (i32) -> i8 - ! 
CHECK: fir.store %[[VAL_128]] to %[[VAL_84]] : !fir.ref - call ieee_get_rounding_mode(round) - - print*, 'rounding_mode [up ] : ', mode_name(round) - -contains - character(7) function mode_name(m) - type(ieee_round_type), intent(in) :: m - if (m == ieee_nearest) then - mode_name = 'nearest' - else if (m == ieee_to_zero) then - mode_name = 'to_zero' - else if (m == ieee_up) then - mode_name = 'up' - else if (m == ieee_down) then - mode_name = 'down' - else if (m == ieee_away) then - mode_name = 'away' - else if (m == ieee_other) then - mode_name = 'other' - else - mode_name = '???' - endif - end -end diff --git a/flang/test/Lower/Intrinsics/ieee_festatus.f90 b/flang/test/Lower/Intrinsics/ieee_festatus.f90 deleted file mode 100644 index 66b1472101ef7..0000000000000 --- a/flang/test/Lower/Intrinsics/ieee_festatus.f90 +++ /dev/null @@ -1,120 +0,0 @@ -! RUN: bbc -emit-fir -o - %s | FileCheck %s - -! CHECK-LABEL: c.func @_QQmain -program s - use ieee_arithmetic - - ! CHECK: %[[V_0:[0-9]+]] = fir.address_of(@_QM__fortran_ieee_exceptionsECieee_all) : !fir.ref>> - ! CHECK: %[[V_1:[0-9]+]] = fir.shape %c5{{.*}} : (index) -> !fir.shape<1> - ! CHECK: %[[V_2:[0-9]+]] = fir.declare %[[V_0]](%[[V_1]]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QM__fortran_ieee_exceptionsECieee_all"} : (!fir.ref>>, !fir.shape<1>) -> !fir.ref>> - ! CHECK: %[[V_53:[0-9]+]] = fir.address_of(@_QM__fortran_ieee_exceptionsECieee_usual) : !fir.ref>> - ! CHECK: %[[V_54:[0-9]+]] = fir.shape %c3{{.*}} : (index) -> !fir.shape<1> - ! CHECK: %[[V_55:[0-9]+]] = fir.declare %[[V_53]](%[[V_54]]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QM__fortran_ieee_exceptionsECieee_usual"} : (!fir.ref>>, !fir.shape<1>) -> !fir.ref>> - use ieee_exceptions - - ! CHECK: %[[V_56:[0-9]+]] = fir.alloca !fir.type<_QM__fortran_ieee_exceptionsTieee_status_type{_QM__fortran_ieee_exceptionsTieee_status_type.__data:!fir.array<8xi32>}> {bindc_name = "status", uniq_name = "_QFEstatus"} - ! CHECK: %[[V_57:[0-9]+]] = fir.declare %[[V_56]] {uniq_name = "_QFEstatus"} : (!fir.ref}>>) -> !fir.ref}>> - type(ieee_status_type) :: status - - ! CHECK: %[[V_58:[0-9]+]] = fir.alloca !fir.array<5x!fir.logical<4>> {bindc_name = "v", uniq_name = "_QFEv"} - ! CHECK: %[[V_59:[0-9]+]] = fir.declare %[[V_58]](%[[V_1]]) {uniq_name = "_QFEv"} : (!fir.ref>>, !fir.shape<1>) -> !fir.ref>> - logical :: v(size(ieee_all)) - - ! CHECK: %[[V_60:[0-9]+]] = fir.address_of(@_QQro.5x_QM__fortran_builtinsT__builtin_ieee_flag_type.0) : !fir.ref>> - ! CHECK: %[[V_61:[0-9]+]] = fir.declare %[[V_60]](%[[V_1]]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QQro.5x_QM__fortran_builtinsT__builtin_ieee_flag_type.0"} : (!fir.ref>>, !fir.shape<1>) -> !fir.ref>> - ! CHECK: fir.do_loop %arg0 = %c1{{.*}} to %c5{{.*}} step %c1{{.*}} { - ! CHECK: %[[V_95:[0-9]+]] = fir.array_coor %[[V_61]](%[[V_1]]) %arg0 : (!fir.ref>>, !fir.shape<1>, index) -> !fir.ref> - ! CHECK: %[[V_96:[0-9]+]] = fir.field_index _QM__fortran_builtinsT__builtin_ieee_flag_type.flag, !fir.type<_QM__fortran_builtinsT__builtin_ieee_flag_type{_QM__fortran_builtinsT__builtin_ieee_flag_type.flag:i8}> - ! CHECK: %[[V_97:[0-9]+]] = fir.coordinate_of %[[V_95]], %[[V_96]] : (!fir.ref>, !fir.field) -> !fir.ref - ! CHECK: %[[V_98:[0-9]+]] = fir.load %[[V_97]] : !fir.ref - ! CHECK: %[[V_99:[0-9]+]] = fir.convert %[[V_98]] : (i8) -> i32 - ! CHECK: %[[V_100:[0-9]+]] = fir.call @_FortranAMapException(%[[V_99]]) fastmath : (i32) -> i32 - ! CHECK: fir.if %true{{[_0-9]*}} { - ! 
CHECK: %[[V_101:[0-9]+]] = fir.call @feenableexcept(%[[V_100]]) fastmath : (i32) -> i32 - ! CHECK: } else { - ! CHECK: %[[V_101:[0-9]+]] = fir.call @fedisableexcept(%[[V_100]]) fastmath : (i32) -> i32 - ! CHECK: } - ! CHECK: } - call ieee_set_halting_mode(ieee_all, .true.) - - ! CHECK: %[[V_62:[0-9]+]] = fir.declare %[[V_60]](%[[V_1]]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QQro.5x_QM__fortran_builtinsT__builtin_ieee_flag_type.0"} : (!fir.ref>>, !fir.shape<1>) -> !fir.ref>> - ! CHECK: fir.do_loop %arg0 = %c1{{.*}} to %c5{{.*}} step %c1{{.*}} { - ! CHECK: %[[V_95:[0-9]+]] = fir.array_coor %[[V_62]](%[[V_1]]) %arg0 : (!fir.ref>>, !fir.shape<1>, index) -> !fir.ref> - ! CHECK: %[[V_96:[0-9]+]] = fir.array_coor %[[V_59]](%[[V_1]]) %arg0 : (!fir.ref>>, !fir.shape<1>, index) -> !fir.ref> - ! CHECK: %[[V_97:[0-9]+]] = fir.field_index _QM__fortran_builtinsT__builtin_ieee_flag_type.flag, !fir.type<_QM__fortran_builtinsT__builtin_ieee_flag_type{_QM__fortran_builtinsT__builtin_ieee_flag_type.flag:i8}> - ! CHECK: %[[V_98:[0-9]+]] = fir.coordinate_of %[[V_95]], %[[V_97]] : (!fir.ref>, !fir.field) -> !fir.ref - ! CHECK: %[[V_99:[0-9]+]] = fir.load %[[V_98]] : !fir.ref - ! CHECK: %[[V_100:[0-9]+]] = fir.call @fegetexcept() fastmath : () -> i32 - ! CHECK: %[[V_101:[0-9]+]] = fir.convert %[[V_99]] : (i8) -> i32 - ! CHECK: %[[V_102:[0-9]+]] = fir.call @_FortranAMapException(%[[V_101]]) fastmath : (i32) -> i32 - ! CHECK: %[[V_103:[0-9]+]] = arith.andi %[[V_100]], %[[V_102]] : i32 - ! CHECK: %[[V_104:[0-9]+]] = arith.cmpi ne, %[[V_103]], %c0{{.*}} : i32 - ! CHECK: %[[V_105:[0-9]+]] = fir.convert %[[V_104]] : (i1) -> !fir.logical<4> - ! CHECK: fir.store %[[V_105]] to %[[V_96]] : !fir.ref> - ! CHECK: } - call ieee_get_halting_mode(ieee_all, v) - - print*, 'halting_mode [T T T T T] :', v - - ! CHECK: %[[V_75:[0-9]+]] = fir.convert %[[V_57]] : (!fir.ref}>>) -> !fir.ref - ! CHECK: %[[V_76:[0-9]+]] = fir.call @fegetenv(%[[V_75]]) fastmath : (!fir.ref) -> i32 - call ieee_get_status(status) - - ! CHECK: %[[V_77:[0-9]+]] = fir.address_of(@_QQro.3x_QM__fortran_builtinsT__builtin_ieee_flag_type.1) : !fir.ref>> - ! CHECK: %[[V_78:[0-9]+]] = fir.declare %[[V_77]](%[[V_54]]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QQro.3x_QM__fortran_builtinsT__builtin_ieee_flag_type.1"} : (!fir.ref>>, !fir.shape<1>) -> !fir.ref>> - ! CHECK: fir.do_loop %arg0 = %c1{{.*}} to %c3{{.*}} step %c1{{.*}} { - ! CHECK: %[[V_95:[0-9]+]] = fir.array_coor %[[V_78]](%[[V_54]]) %arg0 : (!fir.ref>>, !fir.shape<1>, index) -> !fir.ref> - ! CHECK: %[[V_96:[0-9]+]] = fir.field_index _QM__fortran_builtinsT__builtin_ieee_flag_type.flag, !fir.type<_QM__fortran_builtinsT__builtin_ieee_flag_type{_QM__fortran_builtinsT__builtin_ieee_flag_type.flag:i8}> - ! CHECK: %[[V_97:[0-9]+]] = fir.coordinate_of %[[V_95]], %[[V_96]] : (!fir.ref>, !fir.field) -> !fir.ref - ! CHECK: %[[V_98:[0-9]+]] = fir.load %[[V_97]] : !fir.ref - ! CHECK: %[[V_99:[0-9]+]] = fir.convert %[[V_98]] : (i8) -> i32 - ! CHECK: %[[V_100:[0-9]+]] = fir.call @_FortranAMapException(%[[V_99]]) fastmath : (i32) -> i32 - ! CHECK: fir.if %false{{[_0-9]*}} { - ! CHECK: %[[V_101:[0-9]+]] = fir.call @feenableexcept(%[[V_100]]) fastmath : (i32) -> i32 - ! CHECK: } else { - ! CHECK: %[[V_101:[0-9]+]] = fir.call @fedisableexcept(%[[V_100]]) fastmath : (i32) -> i32 - ! CHECK: } - ! CHECK: } - call ieee_set_halting_mode(ieee_usual, .false.) - - ! 
CHECK: %[[V_79:[0-9]+]] = fir.declare %[[V_60]](%[[V_1]]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QQro.5x_QM__fortran_builtinsT__builtin_ieee_flag_type.0"} : (!fir.ref>>, !fir.shape<1>) -> !fir.ref>> - ! CHECK: fir.do_loop %arg0 = %c1{{.*}} to %c5{{.*}} step %c1{{.*}} { - ! CHECK: %[[V_95:[0-9]+]] = fir.array_coor %[[V_79]](%[[V_1]]) %arg0 : (!fir.ref>>, !fir.shape<1>, index) -> !fir.ref> - ! CHECK: %[[V_96:[0-9]+]] = fir.array_coor %[[V_59]](%[[V_1]]) %arg0 : (!fir.ref>>, !fir.shape<1>, index) -> !fir.ref> - ! CHECK: %[[V_97:[0-9]+]] = fir.field_index _QM__fortran_builtinsT__builtin_ieee_flag_type.flag, !fir.type<_QM__fortran_builtinsT__builtin_ieee_flag_type{_QM__fortran_builtinsT__builtin_ieee_flag_type.flag:i8}> - ! CHECK: %[[V_98:[0-9]+]] = fir.coordinate_of %[[V_95]], %[[V_97]] : (!fir.ref>, !fir.field) -> !fir.ref - ! CHECK: %[[V_99:[0-9]+]] = fir.load %[[V_98]] : !fir.ref - ! CHECK: %[[V_100:[0-9]+]] = fir.call @fegetexcept() fastmath : () -> i32 - ! CHECK: %[[V_101:[0-9]+]] = fir.convert %[[V_99]] : (i8) -> i32 - ! CHECK: %[[V_102:[0-9]+]] = fir.call @_FortranAMapException(%[[V_101]]) fastmath : (i32) -> i32 - ! CHECK: %[[V_103:[0-9]+]] = arith.andi %[[V_100]], %[[V_102]] : i32 - ! CHECK: %[[V_104:[0-9]+]] = arith.cmpi ne, %[[V_103]], %c0{{.*}} : i32 - ! CHECK: %[[V_105:[0-9]+]] = fir.convert %[[V_104]] : (i1) -> !fir.logical<4> - ! CHECK: fir.store %[[V_105]] to %[[V_96]] : !fir.ref> - ! CHECK: } - call ieee_get_halting_mode(ieee_all, v) - - print*, 'halting_mode [F F F T T] :', v - - ! CHECK: %[[V_87:[0-9]+]] = fir.call @fesetenv(%[[V_75]]) fastmath : (!fir.ref) -> i32 - ! CHECK: %[[V_88:[0-9]+]] = fir.declare %[[V_60]](%[[V_1]]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QQro.5x_QM__fortran_builtinsT__builtin_ieee_flag_type.0"} : (!fir.ref>>, !fir.shape<1>) -> !fir.ref>> - call ieee_set_status(status) - - ! CHECK: fir.do_loop %arg0 = %c1{{.*}} to %c5{{.*}} step %c1{{.*}} { - ! CHECK: %[[V_95:[0-9]+]] = fir.array_coor %[[V_88]](%[[V_1]]) %arg0 : (!fir.ref>>, !fir.shape<1>, index) -> !fir.ref> - ! CHECK: %[[V_96:[0-9]+]] = fir.array_coor %[[V_59]](%[[V_1]]) %arg0 : (!fir.ref>>, !fir.shape<1>, index) -> !fir.ref> - ! CHECK: %[[V_97:[0-9]+]] = fir.field_index _QM__fortran_builtinsT__builtin_ieee_flag_type.flag, !fir.type<_QM__fortran_builtinsT__builtin_ieee_flag_type{_QM__fortran_builtinsT__builtin_ieee_flag_type.flag:i8}> - ! CHECK: %[[V_98:[0-9]+]] = fir.coordinate_of %[[V_95]], %[[V_97]] : (!fir.ref>, !fir.field) -> !fir.ref - ! CHECK: %[[V_99:[0-9]+]] = fir.load %[[V_98]] : !fir.ref - ! CHECK: %[[V_100:[0-9]+]] = fir.call @fegetexcept() fastmath : () -> i32 - ! CHECK: %[[V_101:[0-9]+]] = fir.convert %[[V_99]] : (i8) -> i32 - ! CHECK: %[[V_102:[0-9]+]] = fir.call @_FortranAMapException(%[[V_101]]) fastmath : (i32) -> i32 - ! CHECK: %[[V_103:[0-9]+]] = arith.andi %[[V_100]], %[[V_102]] : i32 - ! CHECK: %[[V_104:[0-9]+]] = arith.cmpi ne, %[[V_103]], %c0{{.*}} : i32 - ! CHECK: %[[V_105:[0-9]+]] = fir.convert %[[V_104]] : (i1) -> !fir.logical<4> - ! CHECK: fir.store %[[V_105]] to %[[V_96]] : !fir.ref> - ! 
-  ! CHECK: }
-  call ieee_get_halting_mode(ieee_all, v)
-
-  print*, 'halting_mode [T T T T T] :', v
-end

From 3fd296ece69c0bab77ce0df9a128202746ffce94 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Wed, 15 Jan 2025 07:55:28 -0800
Subject: [PATCH 75/82] [AST] Migrate away from PointerUnion::dyn_cast (NFC) (#123012)

Note that PointerUnion::dyn_cast has been soft deprecated in
PointerUnion.h:

  // FIXME: Replace the uses of is(), get() and dyn_cast() with
  //        isa<T>, cast<T> and the llvm::dyn_cast<T>

Literal migration would result in dyn_cast_if_present (see the
definition of PointerUnion::dyn_cast), but this patch uses dyn_cast
because we expect P to be nonnull.
---
 clang/lib/AST/ASTImporter.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp
index dec4c7221bc77..0669aa1b809c3 100644
--- a/clang/lib/AST/ASTImporter.cpp
+++ b/clang/lib/AST/ASTImporter.cpp
@@ -6376,7 +6376,7 @@ ExpectedDecl ASTNodeImporter::VisitClassTemplateSpecializationDecl(
   D2->setTemplateSpecializationKind(D->getTemplateSpecializationKind());
 
   if (auto P = D->getInstantiatedFrom()) {
-    if (auto *CTD = P.dyn_cast<ClassTemplateDecl *>()) {
+    if (auto *CTD = dyn_cast<ClassTemplateDecl *>(P)) {
       if (auto CTDorErr = import(CTD))
         D2->setInstantiationOf(*CTDorErr);
     } else {

From 3a3a1e4627a37bdf5915b60fe375443bb280f23b Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Wed, 15 Jan 2025 07:55:52 -0800
Subject: [PATCH 76/82] [CodeGen] Migrate away from PointerUnion::dyn_cast (NFC) (#123013)

Note that PointerUnion::dyn_cast has been soft deprecated in
PointerUnion.h:

  // FIXME: Replace the uses of is(), get() and dyn_cast() with
  //        isa<T>, cast<T> and the llvm::dyn_cast<T>

Literal migration would result in dyn_cast_if_present (see the
definition of PointerUnion::dyn_cast), but this patch uses dyn_cast
because we expect Data to be nonnull.
---
 clang/lib/CodeGen/CGOpenMPRuntime.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 244e3066f8fe4..ddcb04d53661d 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -7768,7 +7768,7 @@ class MappableExprsHandler {
              &Data : RecordLayout) {
       if (Data.isNull())
         continue;
-      if (const auto *Base = Data.dyn_cast<const CXXRecordDecl *>())
+      if (const auto *Base = dyn_cast<const CXXRecordDecl *>(Data))
        getPlainLayout(Base, Layout, /*AsBase=*/true);
      else
        Layout.push_back(cast<const FieldDecl *>(Data));

From acdcdbcad9b6008687570270cbdf2d6fe86318db Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Wed, 15 Jan 2025 07:56:19 -0800
Subject: [PATCH 77/82] [Sema] Migrate away from PointerUnion::dyn_cast (NFC) (#123014)

Note that PointerUnion::dyn_cast has been soft deprecated in
PointerUnion.h:

  // FIXME: Replace the uses of is(), get() and dyn_cast() with
  //        isa<T>, cast<T> and the llvm::dyn_cast<T>

Literal migration would result in dyn_cast_if_present (see the
definition of PointerUnion::dyn_cast), but this patch uses dyn_cast
because we expect EnumUnderlying to be nonnull.
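The shape of this three-patch migration, sketched with toy pointee types (A
and B below are illustrative stand-ins, not types from these patches):

  #include "llvm/ADT/PointerUnion.h"
  #include "llvm/Support/Casting.h"

  struct A { int X = 0; };
  struct B { float Y = 0.0f; };

  int probe(llvm::PointerUnion<A *, B *> P) {
    // Deprecated member form: P.dyn_cast<A *>()
    // Free-function form adopted by these patches; asserts P is nonnull:
    if (A *AP = llvm::dyn_cast<A *>(P))
      return AP->X;
    // When the union itself may be null, the literal replacement is:
    if (B *BP = llvm::dyn_cast_if_present<B *>(P))
      return static_cast<int>(BP->Y);
    return -1;
  }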
--- clang/lib/Sema/SemaDecl.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 704cb82b291cc..e0dd6039810cb 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -17310,7 +17310,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc, return nullptr; if (EnumUnderlying) { EnumDecl *ED = cast(New); - if (TypeSourceInfo *TI = EnumUnderlying.dyn_cast()) + if (TypeSourceInfo *TI = dyn_cast(EnumUnderlying)) ED->setIntegerTypeSourceInfo(TI); else ED->setIntegerType(QualType(cast(EnumUnderlying), 0)); @@ -17943,7 +17943,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc, if (EnumUnderlying) { EnumDecl *ED = cast(New); - if (TypeSourceInfo *TI = EnumUnderlying.dyn_cast()) + if (TypeSourceInfo *TI = dyn_cast(EnumUnderlying)) ED->setIntegerTypeSourceInfo(TI); else ED->setIntegerType(QualType(cast(EnumUnderlying), 0)); From 94e9813a20ed5885ae2908f8519f93d8082fd1f3 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 15 Jan 2025 07:56:45 -0800 Subject: [PATCH 78/82] [AsmParser] Avoid repeated map lookups (NFC) (#123015) --- llvm/lib/AsmParser/LLParser.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 81d048b32e139..be6166f0c4169 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -941,8 +941,8 @@ bool LLParser::parseMDNodeID(MDNode *&Result) { return true; // If not a forward reference, just return it now. - if (NumberedMetadata.count(MID)) { - Result = NumberedMetadata[MID]; + if (auto It = NumberedMetadata.find(MID); It != NumberedMetadata.end()) { + Result = It->second; return false; } From ebb58567e79046afb28a4e657d1b1fd481a595a0 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 15 Jan 2025 07:57:04 -0800 Subject: [PATCH 79/82] [CodeGen] Avoid repeated hash lookups (NFC) (#123016) --- llvm/lib/CodeGen/CodeGenPrepare.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index ba1b10ec8b9b1..a3392b7110989 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -7690,8 +7690,8 @@ bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) { // sunk instruction uses, if it is part of a chain that has already been // sunk. 
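This and the two patches after it make the same mechanical change: a hash
container was queried twice (count() followed by operator[] or by an insert)
where a single find() or try_emplace() does the job. A standalone sketch of
the before/after shape (container and names are illustrative, not code from
these patches):

  #include "llvm/ADT/DenseMap.h"

  llvm::DenseMap<unsigned, int> Table;

  int lookupTwice(unsigned Key) {
    if (Table.count(Key)) // first hash walk
      return Table[Key];  // second hash walk
    return -1;
  }

  int lookupOnce(unsigned Key) {
    if (auto It = Table.find(Key); It != Table.end()) // one hash walk
      return It->second;
    return -1;
  }

  void assignIdOnce(unsigned Key, int &NextId) {
    // Insert-if-absent in one probe; .second reports whether the insertion
    // happened (the pattern the TableGen patch below adopts).
    if (Table.try_emplace(Key, NextId).second)
      ++NextId;
  }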
---
 llvm/lib/AsmParser/LLParser.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 81d048b32e139..be6166f0c4169 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -941,8 +941,8 @@ bool LLParser::parseMDNodeID(MDNode *&Result) {
     return true;
 
   // If not a forward reference, just return it now.
-  if (NumberedMetadata.count(MID)) {
-    Result = NumberedMetadata[MID];
+  if (auto It = NumberedMetadata.find(MID); It != NumberedMetadata.end()) {
+    Result = It->second;
     return false;
   }

From ebb58567e79046afb28a4e657d1b1fd481a595a0 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Wed, 15 Jan 2025 07:57:04 -0800
Subject: [PATCH 79/82] [CodeGen] Avoid repeated hash lookups (NFC) (#123016)
---
 llvm/lib/CodeGen/CodeGenPrepare.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index ba1b10ec8b9b1..a3392b7110989 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -7690,8 +7690,8 @@ bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) {
       // sunk instruction uses, if it is part of a chain that has already been
      // sunk.
      Instruction *OldI = cast<Instruction>(U->getUser());
-      if (NewInstructions.count(OldI))
-        NewInstructions[OldI]->setOperand(U->getOperandNo(), NI);
+      if (auto It = NewInstructions.find(OldI); It != NewInstructions.end())
+        It->second->setOperand(U->getOperandNo(), NI);
       else
         U->set(NI);
       Changed = true;

From 618ac908db72c312e1fbdfe974b4a084b4fc4c45 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Wed, 15 Jan 2025 07:57:30 -0800
Subject: [PATCH 80/82] [TableGen] Avoid repeated hash lookups (NFC) (#123018)
---
 llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp | 4 ++--
 llvm/utils/TableGen/GlobalISelEmitter.cpp         | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp
index 149ba7a1d9032..bc300c3461100 100644
--- a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp
+++ b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp
@@ -2645,8 +2645,8 @@ GICombinerEmitter::buildMatchTable(MutableArrayRef<RuleMatcher> Rules) {
   for (RuleMatcher &Rule : Rules) {
     const StringRef Opcode = Rule.getOpcode();
     assert(!Opcode.empty() && "Didn't expect an undefined opcode");
-    if (OpcodeOrder.count(Opcode) == 0)
-      OpcodeOrder[Opcode] = CurrentOrdering++;
+    if (OpcodeOrder.try_emplace(Opcode, CurrentOrdering).second)
+      ++CurrentOrdering;
   }
 
   llvm::stable_sort(InputRules, [&OpcodeOrder](const Matcher *A,

diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp
index 3b334ea4ce152..04ebdbb0ffc90 100644
--- a/llvm/utils/TableGen/GlobalISelEmitter.cpp
+++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp
@@ -2216,8 +2216,8 @@ GlobalISelEmitter::buildMatchTable(MutableArrayRef<RuleMatcher> Rules,
   for (RuleMatcher &Rule : Rules) {
     const StringRef Opcode = Rule.getOpcode();
     assert(!Opcode.empty() && "Didn't expect an undefined opcode");
-    if (OpcodeOrder.count(Opcode) == 0)
-      OpcodeOrder[Opcode] = CurrentOrdering++;
+    if (OpcodeOrder.try_emplace(Opcode, CurrentOrdering).second)
+      ++CurrentOrdering;
   }
 
   llvm::stable_sort(

From 8ac35bda18229e28e1638756a5187aa2608a4b50 Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu"
Date: Wed, 15 Jan 2025 10:58:25 -0500
Subject: [PATCH 81/82] [CUDA][HIP] Support CUID in new driver (#122859)

CUID is needed by CUDA/HIP to support accessing static device variables
from host functions.

Currently CUID is only supported by the old driver for CUDA/HIP. The new
driver does not support it, which causes CUDA/HIP programs that use
static device variables in host functions to fail with the new driver.

This patch refactors the CUID support in the old driver so that CUID is
supported by both the old and the new drivers for CUDA/HIP.
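For reference, the hash mode moved into CUIDOptions::getCUID derives the ID
from the input's resolved path plus every non-input driver argument, so two
compilations of the same file with the same flags agree on the CUID. A
trimmed, standalone sketch of that scheme (plain strings stand in for the
driver's parsed argument list; this mirrors, but does not replace, the code
in the diff below):

  #include "llvm/ADT/SmallString.h"
  #include "llvm/ADT/StringExtras.h"
  #include "llvm/Support/FileSystem.h"
  #include "llvm/Support/MD5.h"
  #include <string>
  #include <vector>

  std::string hashCUID(llvm::StringRef InputFile,
                       const std::vector<std::string> &Args) {
    llvm::MD5 Hasher;
    llvm::MD5::MD5Result Hash;
    llvm::SmallString<256> RealPath;
    // Hash the canonical path, not the spelling on the command line.
    llvm::sys::fs::real_path(InputFile, RealPath, /*expand_tilde=*/true);
    Hasher.update(RealPath);
    for (const std::string &A : Args)
      Hasher.update(A); // the real code skips OPT_INPUT arguments here
    Hasher.final(Hash);
    return llvm::utohexstr(Hash.low(), /*LowerCase=*/true);
  }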
---
 clang/include/clang/Driver/Driver.h |  32 +++++++-
 clang/lib/Driver/Driver.cpp         | 116 ++++++++++++++++------------
 clang/test/Driver/hip-cuid.hip      |  37 +++++++--
 3 files changed, 125 insertions(+), 60 deletions(-)

diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h
index 80bce574a3b64..e6d1e1f888f25 100644
--- a/clang/include/clang/Driver/Driver.h
+++ b/clang/include/clang/Driver/Driver.h
@@ -72,6 +72,29 @@ enum ModuleHeaderMode {
   HeaderMode_System
 };
 
+/// Options for specifying CUID used by CUDA/HIP for uniquely identifying
+/// compilation units.
+class CUIDOptions {
+public:
+  enum class Kind { Hash, Random, Fixed, None, Invalid };
+
+  CUIDOptions() = default;
+  CUIDOptions(const CUIDOptions &) = default;
+  CUIDOptions(llvm::opt::DerivedArgList &Args, const Driver &D);
+
+  // Get the CUID for an input string
+  std::string getCUID(StringRef InputFile,
+                      llvm::opt::DerivedArgList &Args) const;
+
+  bool isEnabled() const {
+    return UseCUID != Kind::None && UseCUID != Kind::Invalid;
+  }
+
+private:
+  Kind UseCUID = Kind::None;
+  StringRef FixedCUID;
+};
+
 /// Driver - Encapsulate logic for constructing compilation processes
 /// from a set of gcc-driver-like command line arguments.
 class Driver {
@@ -119,6 +142,9 @@ class Driver {
   /// LTO mode selected via -f(no-offload-)?lto(=.*)? options.
   LTOKind OffloadLTOMode;
 
+  /// Options for CUID
+  CUIDOptions CUIDOpts;
+
 public:
   enum OpenMPRuntimeKind {
     /// An unknown OpenMP runtime. We can't generate effective OpenMP code
@@ -501,10 +527,11 @@ class Driver {
   /// \param C - The compilation that is being built.
   /// \param Args - The input arguments.
   /// \param Input - The input type and arguments
+  /// \param CUID - The CUID for \p Input
   /// \param HostAction - The host action used in the offloading toolchain.
   Action *BuildOffloadingActions(Compilation &C,
                                  llvm::opt::DerivedArgList &Args,
-                                 const InputTy &Input,
+                                 const InputTy &Input, StringRef CUID,
                                  Action *HostAction) const;
 
   /// Returns the set of bound architectures active for this offload kind.
@@ -728,6 +755,9 @@ class Driver {
   /// Get the specific kind of offload LTO being performed.
   LTOKind getOffloadLTOMode() const { return OffloadLTOMode; }
 
+  /// Get the CUID option.
+  const CUIDOptions &getCUIDOpts() const { return CUIDOpts; }
+
 private:
 
   /// Tries to load options from configuration files.

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index eefbdca805739..7767c81d654dc 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -197,6 +197,50 @@ std::string Driver::GetResourcesPath(StringRef BinaryPath) {
   return std::string(P);
 }
 
+CUIDOptions::CUIDOptions(llvm::opt::DerivedArgList &Args, const Driver &D)
+    : UseCUID(Kind::Hash) {
+  if (Arg *A = Args.getLastArg(options::OPT_fuse_cuid_EQ)) {
+    StringRef UseCUIDStr = A->getValue();
+    UseCUID = llvm::StringSwitch<Kind>(UseCUIDStr)
+                  .Case("hash", Kind::Hash)
+                  .Case("random", Kind::Random)
+                  .Case("none", Kind::None)
+                  .Default(Kind::Invalid);
+    if (UseCUID == Kind::Invalid)
+      D.Diag(clang::diag::err_drv_invalid_value)
+          << A->getAsString(Args) << UseCUIDStr;
+  }
+
+  FixedCUID = Args.getLastArgValue(options::OPT_cuid_EQ);
+  if (!FixedCUID.empty())
+    UseCUID = Kind::Fixed;
+}
+
+std::string CUIDOptions::getCUID(StringRef InputFile,
+                                 llvm::opt::DerivedArgList &Args) const {
+  std::string CUID = FixedCUID.str();
+  if (CUID.empty()) {
+    if (UseCUID == Kind::Random)
+      CUID = llvm::utohexstr(llvm::sys::Process::GetRandomNumber(),
+                             /*LowerCase=*/true);
+    else if (UseCUID == Kind::Hash) {
+      llvm::MD5 Hasher;
+      llvm::MD5::MD5Result Hash;
+      SmallString<256> RealPath;
+      llvm::sys::fs::real_path(InputFile, RealPath,
+                               /*expand_tilde=*/true);
+      Hasher.update(RealPath);
+      for (auto *A : Args) {
+        if (A->getOption().matches(options::OPT_INPUT))
+          continue;
+        Hasher.update(A->getAsString(Args));
+      }
+      Hasher.final(Hash);
+      CUID = llvm::utohexstr(Hash.low(), /*LowerCase=*/true);
+    }
+  }
+  return CUID;
+}
+
 Driver::Driver(StringRef ClangExecutable, StringRef TargetTriple,
                DiagnosticsEngine &Diags, std::string Title,
                IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS)
@@ -875,6 +919,9 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
     C.addOffloadDeviceToolChain(HIPTC, OFK);
   }
 
+  if (IsCuda || IsHIP)
+    CUIDOpts = CUIDOptions(C.getArgs(), *this);
+
   //
   // OpenMP
   //
@@ -3161,19 +3208,15 @@ class OffloadingActionBuilder final {
     /// Default GPU architecture if there's no one specified.
     OffloadArch DefaultOffloadArch = OffloadArch::UNKNOWN;
 
-    /// Method to generate compilation unit ID specified by option
-    /// '-fuse-cuid='.
-    enum UseCUIDKind { CUID_Hash, CUID_Random, CUID_None, CUID_Invalid };
-    UseCUIDKind UseCUID = CUID_Hash;
-
-    /// Compilation unit ID specified by option '-cuid='.
-    StringRef FixedCUID;
+    /// Compilation unit ID specified by option '-fuse-cuid=' or '-cuid='.
+    const CUIDOptions &CUIDOpts;
 
   public:
     CudaActionBuilderBase(Compilation &C, DerivedArgList &Args,
                           const Driver::InputList &Inputs,
                           Action::OffloadKind OFKind)
-        : DeviceActionBuilder(C, Args, Inputs, OFKind) {
+        : DeviceActionBuilder(C, Args, Inputs, OFKind),
+          CUIDOpts(C.getDriver().getCUIDOpts()) {
 
       CompileDeviceOnly = C.getDriver().offloadDeviceOnly();
       Relocatable = Args.hasFlag(options::OPT_fgpu_rdc,
@@ -3204,28 +3247,8 @@ class OffloadingActionBuilder final {
       // Set the flag to true, so that the builder acts on the current input.
       IsActive = true;
 
-      std::string CUID = FixedCUID.str();
-      if (CUID.empty()) {
-        if (UseCUID == CUID_Random)
-          CUID = llvm::utohexstr(llvm::sys::Process::GetRandomNumber(),
-                                 /*LowerCase=*/true);
-        else if (UseCUID == CUID_Hash) {
-          llvm::MD5 Hasher;
-          llvm::MD5::MD5Result Hash;
-          SmallString<256> RealPath;
-          llvm::sys::fs::real_path(IA->getInputArg().getValue(), RealPath,
-                                   /*expand_tilde=*/true);
-          Hasher.update(RealPath);
-          for (auto *A : Args) {
-            if (A->getOption().matches(options::OPT_INPUT))
-              continue;
-            Hasher.update(A->getAsString(Args));
-          }
-          Hasher.final(Hash);
-          CUID = llvm::utohexstr(Hash.low(), /*LowerCase=*/true);
-        }
-      }
-      IA->setId(CUID);
+      if (CUIDOpts.isEnabled())
+        IA->setId(CUIDOpts.getCUID(IA->getInputArg().getValue(), Args));
 
       if (CompileHostOnly)
         return ABRT_Success;
@@ -3351,21 +3374,6 @@ class OffloadingActionBuilder final {
       CompileHostOnly = C.getDriver().offloadHostOnly();
       EmitLLVM = Args.getLastArg(options::OPT_emit_llvm);
       EmitAsm = Args.getLastArg(options::OPT_S);
-      FixedCUID = Args.getLastArgValue(options::OPT_cuid_EQ);
-      if (Arg *A = Args.getLastArg(options::OPT_fuse_cuid_EQ)) {
-        StringRef UseCUIDStr = A->getValue();
-        UseCUID = llvm::StringSwitch<UseCUIDKind>(UseCUIDStr)
-                      .Case("hash", CUID_Hash)
-                      .Case("random", CUID_Random)
-                      .Case("none", CUID_None)
-                      .Default(CUID_Invalid);
-        if (UseCUID == CUID_Invalid) {
-          C.getDriver().Diag(diag::err_drv_invalid_value)
-              << A->getAsString(Args) << UseCUIDStr;
-          C.setContainsError();
-          return true;
-        }
-      }
 
       // --offload and --offload-arch options are mutually exclusive.
       if (Args.hasArgNoClaim(options::OPT_offload_EQ) &&
@@ -4366,6 +4374,12 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
     // Build the pipeline for this file.
     Action *Current = C.MakeAction<InputAction>(*InputArg, InputType);
 
+    std::string CUID;
+    if (CUIDOpts.isEnabled() && types::isSrcFile(InputType)) {
+      CUID = CUIDOpts.getCUID(InputArg->getValue(), Args);
+      cast<InputAction>(Current)->setId(CUID);
+    }
+
     // Use the current host action in any of the offloading actions, if
     // required.
    if (!UseNewOffloadingDriver)
@@ -4429,7 +4443,7 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
 
     // Try to build the offloading actions and add the result as a dependency
     // to the host.
     if (UseNewOffloadingDriver)
-      Current = BuildOffloadingActions(C, Args, I, Current);
+      Current = BuildOffloadingActions(C, Args, I, CUID, Current);
     // Use the current host action in any of the offloading actions, if
     // required.
     else if (OffloadBuilder->addHostDependenceToDeviceActions(Current,
@@ -4766,7 +4780,7 @@ Driver::getOffloadArchs(Compilation &C, const llvm::opt::DerivedArgList &Args,
 
 Action *Driver::BuildOffloadingActions(Compilation &C,
                                        llvm::opt::DerivedArgList &Args,
-                                       const InputTy &Input,
+                                       const InputTy &Input, StringRef CUID,
                                        Action *HostAction) const {
   // Don't build offloading actions if explicitly disabled or we do not have a
   // valid source input and compile action to embed it in. If preprocessing only
@@ -4807,13 +4821,13 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
     llvm::DenseSet<StringRef> Arches = getOffloadArchs(C, Args, Kind, TC);
     SmallVector<StringRef> Sorted(Arches.begin(), Arches.end());
     llvm::sort(Sorted);
-    for (StringRef Arch : Sorted)
+    for (StringRef Arch : Sorted) {
       TCAndArchs.push_back(std::make_pair(TC, Arch));
+      DeviceActions.push_back(
+          C.MakeAction<InputAction>(*InputArg, InputType, CUID));
+    }
   }
 
-  for (unsigned I = 0, E = TCAndArchs.size(); I != E; ++I)
-    DeviceActions.push_back(C.MakeAction<InputAction>(*InputArg, InputType));
-
   if (DeviceActions.empty())
     return HostAction;
 
diff --git a/clang/test/Driver/hip-cuid.hip b/clang/test/Driver/hip-cuid.hip
index 102ee64d4638d..78c391c966e2a 100644
--- a/clang/test/Driver/hip-cuid.hip
+++ b/clang/test/Driver/hip-cuid.hip
@@ -80,16 +80,37 @@
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN:   2>&1 | FileCheck -check-prefixes=DEVICE %s
 
+// Check cuid is supported by the new driver.
+// RUN: %clang -### -x hip \
+// RUN:   --target=x86_64-unknown-linux-gnu \
+// RUN:   --no-offload-new-driver \
+// RUN:   --offload-arch=gfx900 \
+// RUN:   --offload-arch=gfx906 \
+// RUN:   -c -nogpuinc -nogpulib --offload-new-driver \
+// RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
+// RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN:   2>&1 | FileCheck -check-prefixes=COMMON,HEX %s
+
+// Check cuid is supported by CUDA by the default new driver.
+// RUN: %clang -### -x cu \
+// RUN:   --target=x86_64-unknown-linux-gnu \
+// RUN:   --offload-arch=sm_60 \
+// RUN:   --offload-arch=sm_70 \
+// RUN:   -c -nogpuinc -nogpulib \
+// RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
+// RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN:   2>&1 | FileCheck -check-prefixes=COMMON,HEX %s
+
 // INVALID: invalid value 'invalid' in '-fuse-cuid=invalid'
 
-// COMMON: "-cc1"{{.*}} "-triple" "amdgcn-amd-amdhsa"
-// COMMON-SAME: "-target-cpu" "gfx900"
+// COMMON: "-cc1"{{.*}} "-triple" "[[TRIP:(amdgcn-amd-amdhsa|nvptx64-nvidia-cuda)]]"
+// COMMON-SAME: "-target-cpu" "[[G1:(gfx900|sm_60)]]"
 // HEX-SAME: "-cuid=[[CUID:[0-9a-f]+]]"
 // FIXED-SAME: "-cuid=[[CUID:xyz_123]]"
 // COMMON-SAME: "{{.*}}a.cu"
 
-// COMMON: "-cc1"{{.*}} "-triple" "amdgcn-amd-amdhsa"
-// COMMON-SAME: "-target-cpu" "gfx906"
+// COMMON: "-cc1"{{.*}} "-triple" "[[TRIP]]"
+// COMMON-SAME: "-target-cpu" "[[G2:(gfx906|sm_70)]]"
 // COMMON-SAME: "-cuid=[[CUID]]"
 // COMMON-SAME: "{{.*}}a.cu"
 
@@ -97,15 +118,15 @@
 // COMMON-SAME: "-cuid=[[CUID]]"
 // COMMON-SAME: "{{.*}}a.cu"
 
-// COMMON: "-cc1"{{.*}} "-triple" "amdgcn-amd-amdhsa"
-// COMMON-SAME: "-target-cpu" "gfx900"
+// COMMON: "-cc1"{{.*}} "-triple" "[[TRIP]]"
+// COMMON-SAME: "-target-cpu" "[[G1]]"
 // HEX-NOT: "-cuid=[[CUID]]"
 // HEX-SAME: "-cuid=[[CUID2:[0-9a-f]+]]"
 // FIXED-SAME: "-cuid=[[CUID2:xyz_123]]"
 // COMMON-SAME: "{{.*}}b.hip"
 
-// COMMON: "-cc1"{{.*}} "-triple" "amdgcn-amd-amdhsa"
-// COMMON-SAME: "-target-cpu" "gfx906"
+// COMMON: "-cc1"{{.*}} "-triple" "[[TRIP]]"
+// COMMON-SAME: "-target-cpu" "[[G2]]"
 // HEX-NOT: "-cuid=[[CUID]]"
 // COMMON-SAME: "-cuid=[[CUID2]]"
 // COMMON-SAME: "{{.*}}b.hip"

From 3986cffe81128061b774c06d0ba42ff7340f2d76 Mon Sep 17 00:00:00 2001
From: Brad Smith
Date: Wed, 15 Jan 2025 11:03:33 -0500
Subject: [PATCH 82/82] [lldb] Add OpenBSD signals (#123005)

Signals 1-32 match the default UNIX platform. There are
platform-specific ones above 32.
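The interesting wrinkle in the new file below is its ADD_SIGCODE macro: the
signal numbers and si_code values are hard-coded so the table also works when
lldb is built on another host, while a native OpenBSD build static_asserts
them against the system headers so the constants cannot silently drift. A toy
harness showing the same pattern (AddSignalCode here is a stand-in for the
UnixSignals member, not lldb code):

  #include <cstdio>

  static void AddSignalCode(int Signo, int Code, const char *Desc) {
    std::printf("signal %d, code %d: %s\n", Signo, Code, Desc);
  }

  #ifdef __OpenBSD__
  #include <csignal>
  #define ADD_SIGCODE(Name, Val, CName, CVal, Desc)                          \
    static_assert(Name == Val, "signal number drifted");                     \
    static_assert(CName == CVal, "signal code drifted");                     \
    AddSignalCode(Val, CVal, Desc)
  #else
  // Cross builds never expand Name/CName, so the host headers are not needed.
  #define ADD_SIGCODE(Name, Val, CName, CVal, Desc) AddSignalCode(Val, CVal, Desc)
  #endif

  int main() {
    ADD_SIGCODE(SIGBUS, 10, BUS_ADRALN, 1, "invalid address alignment");
  }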
---
 .../Plugins/Process/Utility/CMakeLists.txt    |  1 +
 .../Process/Utility/OpenBSDSignals.cpp        | 69 +++++++++++++++++++
 .../Plugins/Process/Utility/OpenBSDSignals.h  | 27 ++++++++
 lldb/source/Target/UnixSignals.cpp            |  4 +-
 4 files changed, 100 insertions(+), 1 deletion(-)
 create mode 100644 lldb/source/Plugins/Process/Utility/OpenBSDSignals.cpp
 create mode 100644 lldb/source/Plugins/Process/Utility/OpenBSDSignals.h

diff --git a/lldb/source/Plugins/Process/Utility/CMakeLists.txt b/lldb/source/Plugins/Process/Utility/CMakeLists.txt
index 0e1a5069d4409..f269f5d7d4d74 100644
--- a/lldb/source/Plugins/Process/Utility/CMakeLists.txt
+++ b/lldb/source/Plugins/Process/Utility/CMakeLists.txt
@@ -15,6 +15,7 @@ add_lldb_library(lldbPluginProcessUtility
   NativeRegisterContextDBReg_x86.cpp
   NativeRegisterContextRegisterInfo.cpp
   NetBSDSignals.cpp
+  OpenBSDSignals.cpp
   RegisterContext_x86.cpp
   RegisterContextDarwin_arm.cpp
   RegisterContextDarwin_arm64.cpp

diff --git a/lldb/source/Plugins/Process/Utility/OpenBSDSignals.cpp b/lldb/source/Plugins/Process/Utility/OpenBSDSignals.cpp
new file mode 100644
index 0000000000000..48263235126c0
--- /dev/null
+++ b/lldb/source/Plugins/Process/Utility/OpenBSDSignals.cpp
@@ -0,0 +1,69 @@
+//===-- OpenBSDSignals.cpp ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "OpenBSDSignals.h"
+
+#ifdef __OpenBSD__
+#include
+
+#define ADD_SIGCODE(signal_name, signal_value, code_name, code_value, ...)    \
+  static_assert(signal_name == signal_value,                                  \
+                "Value mismatch for signal number " #signal_name);            \
+  static_assert(code_name == code_value,                                      \
+                "Value mismatch for signal code " #code_name);                \
+  AddSignalCode(signal_value, code_value, __VA_ARGS__)
+#else
+#define ADD_SIGCODE(signal_name, signal_value, code_name, code_value, ...)    \
+  AddSignalCode(signal_value, code_value, __VA_ARGS__)
+#endif /* ifdef __OpenBSD */
+
+using namespace lldb_private;
+
+OpenBSDSignals::OpenBSDSignals() : UnixSignals() { Reset(); }
+
+void OpenBSDSignals::Reset() {
+  UnixSignals::Reset();
+
+  // clang-format off
+  // SIGILL
+  ADD_SIGCODE(SIGILL, 4, ILL_ILLOPC, 1, "illegal opcode");
+  ADD_SIGCODE(SIGILL, 4, ILL_ILLOPN, 2, "illegal operand");
+  ADD_SIGCODE(SIGILL, 4, ILL_ILLADR, 3, "illegal addressing mode");
+  ADD_SIGCODE(SIGILL, 4, ILL_ILLTRP, 4, "illegal trap");
+  ADD_SIGCODE(SIGILL, 4, ILL_PRVOPC, 5, "privileged opcode");
+  ADD_SIGCODE(SIGILL, 4, ILL_PRVREG, 6, "privileged register");
+  ADD_SIGCODE(SIGILL, 4, ILL_COPROC, 7, "coprocessor error");
+  ADD_SIGCODE(SIGILL, 4, ILL_BADSTK, 8, "internal stack error");
+  ADD_SIGCODE(SIGILL, 4, ILL_BTCFI, 9, "IBT missing on indirect call");
+
+  // SIGFPE
+  ADD_SIGCODE(SIGFPE, 8, FPE_INTDIV, 1, "integer divide by zero");
+  ADD_SIGCODE(SIGFPE, 8, FPE_INTOVF, 2, "integer overflow");
+  ADD_SIGCODE(SIGFPE, 8, FPE_FLTDIV, 3, "floating point divide by zero");
+  ADD_SIGCODE(SIGFPE, 8, FPE_FLTOVF, 4, "floating point overflow");
+  ADD_SIGCODE(SIGFPE, 8, FPE_FLTUND, 5, "floating point underflow");
+  ADD_SIGCODE(SIGFPE, 8, FPE_FLTRES, 6, "floating point inexact result");
+  ADD_SIGCODE(SIGFPE, 8, FPE_FLTINV, 7, "invalid floating point operation");
+  ADD_SIGCODE(SIGFPE, 8, FPE_FLTSUB, 8, "subscript out of range");
+
+  // SIGBUS
+  ADD_SIGCODE(SIGBUS, 10, BUS_ADRALN, 1, "invalid address alignment");
+  ADD_SIGCODE(SIGBUS, 10, BUS_ADRERR, 2, "non-existent physical address");
+  ADD_SIGCODE(SIGBUS, 10, BUS_OBJERR, 3, "object specific hardware error");
+
+  // SIGSEGV
+  ADD_SIGCODE(SIGSEGV, 11, SEGV_MAPERR, 1, "address not mapped to object",
+              SignalCodePrintOption::Address);
+  ADD_SIGCODE(SIGSEGV, 11, SEGV_ACCERR, 2, "invalid permissions for mapped object",
+              SignalCodePrintOption::Address);
+
+  //        SIGNO  NAME            SUPPRESS  STOP    NOTIFY  DESCRIPTION
+  //        =====  ==============  ========  ======  ======  ========================
+  AddSignal(32,    "SIGTHR",       false,    false,  false,  "thread library AST");
+  // clang-format on
+}

diff --git a/lldb/source/Plugins/Process/Utility/OpenBSDSignals.h b/lldb/source/Plugins/Process/Utility/OpenBSDSignals.h
new file mode 100644
index 0000000000000..1e2b1fa9d26db
--- /dev/null
+++ b/lldb/source/Plugins/Process/Utility/OpenBSDSignals.h
@@ -0,0 +1,27 @@
+//===-- OpenBSDSignals.h ----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_SOURCE_PLUGINS_PROCESS_UTILITY_OPENBSDSIGNALS_H
+#define LLDB_SOURCE_PLUGINS_PROCESS_UTILITY_OPENBSDSIGNALS_H
+
+#include "lldb/Target/UnixSignals.h"
+
+namespace lldb_private {
+
+/// OpenBSD specific set of Unix signals.
+class OpenBSDSignals : public UnixSignals {
+public:
+  OpenBSDSignals();
+
+private:
+  void Reset() override;
+};
+
+} // namespace lldb_private
+
+#endif // LLDB_SOURCE_PLUGINS_PROCESS_UTILITY_OPENBSDSIGNALS_H

diff --git a/lldb/source/Target/UnixSignals.cpp b/lldb/source/Target/UnixSignals.cpp
index e3c7a83ece073..bee3a63818259 100644
--- a/lldb/source/Target/UnixSignals.cpp
+++ b/lldb/source/Target/UnixSignals.cpp
@@ -10,6 +10,7 @@
 #include "Plugins/Process/Utility/FreeBSDSignals.h"
 #include "Plugins/Process/Utility/LinuxSignals.h"
 #include "Plugins/Process/Utility/NetBSDSignals.h"
+#include "Plugins/Process/Utility/OpenBSDSignals.h"
 #include "lldb/Host/HostInfo.h"
 #include "lldb/Utility/ArchSpec.h"
 #include
@@ -32,10 +33,11 @@ lldb::UnixSignalsSP UnixSignals::Create(const ArchSpec &arch) {
   case llvm::Triple::Linux:
     return std::make_shared<LinuxSignals>();
   case llvm::Triple::FreeBSD:
-  case llvm::Triple::OpenBSD:
     return std::make_shared<FreeBSDSignals>();
   case llvm::Triple::NetBSD:
     return std::make_shared<NetBSDSignals>();
+  case llvm::Triple::OpenBSD:
+    return std::make_shared<OpenBSDSignals>();
   default:
     return std::make_shared<UnixSignals>();
   }
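A final usage note: callers never name the platform classes directly; they go
through the factory patched above, so OpenBSD targets now pick up SIGTHR and
the OpenBSD si_code descriptions automatically. A sketch, limited to the API
visible in this patch (the surrounding function is hypothetical):

  #include "lldb/Target/UnixSignals.h"
  #include "lldb/Utility/ArchSpec.h"

  void attachSignals(const lldb_private::ArchSpec &arch) {
    // Dispatches on the triple's OS, as in UnixSignals::Create above.
    lldb::UnixSignalsSP signals = lldb_private::UnixSignals::Create(arch);
    // ... hand the table to the Process/Platform that owns this target ...
  }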